diff --git a/crates/sandlock-core/src/cow/dispatch.rs b/crates/sandlock-core/src/cow/dispatch.rs index a183f46..98ed9f0 100644 --- a/crates/sandlock-core/src/cow/dispatch.rs +++ b/crates/sandlock-core/src/cow/dispatch.rs @@ -8,14 +8,39 @@ use std::path::{Component, Path, PathBuf}; use std::sync::Arc; use tokio::sync::Mutex; +use tokio::sync::Mutex as AsyncMutex; use crate::arch; use crate::cow::seccomp::SeccompCowBranch; use crate::procfs::{build_dirent64, DT_DIR, DT_LNK, DT_REG}; use crate::seccomp::notif::{read_child_mem, write_child_mem, NotifAction}; -use crate::seccomp::state::{CowState, PidKey}; +use crate::seccomp::state::{CowState, PerProcessState, ProcessIndex}; use crate::sys::structs::SeccompNotif; +/// Acquire the per-process state handle for `notif.pid`. Returns +/// None if the pid isn't tracked (pidfd_open failed at fork on an +/// old kernel, or the process is gone) — callers should fall back +/// to `NotifAction::Continue`. +fn pp_handle( + processes: &Arc, + pid: u32, +) -> Option>> { + processes + .entry_for(i32::try_from(pid).ok()?) + .map(|(_, s)| s) +} + +/// Read the current virtual cwd for `pid` (None if the process +/// hasn't chdir'd into a COW-only directory, or isn't tracked). +async fn current_virtual_cwd( + processes: &Arc, + pid: u32, +) -> Option { + let handle = pp_handle(processes, pid)?; + let cwd = handle.lock().await.virtual_cwd.clone(); + cwd +} + /// Read a NUL-terminated path from child memory (up to 4096 bytes for filesystem paths). /// /// Reads page-by-page to avoid crossing into unmapped memory (e.g. when the path @@ -100,29 +125,6 @@ fn map_cow_upper_path(cow: &SeccompCowBranch, path: &str) -> String { normalize_path(path).to_string_lossy().into_owned() } -fn read_pid_start_time(pid: u32) -> Option { - let stat = std::fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?; - let rest = stat.rsplit_once(") ")?.1; - // starttime is field 22; after "pid (comm)" the first token is field 3. 
- rest.split_whitespace().nth(19)?.parse().ok() -} - -fn cow_pid_key(pid: u32) -> Option { - Some(PidKey { - pid: i32::try_from(pid).ok()?, - start_time: read_pid_start_time(pid)?, - }) -} - -fn current_virtual_cwd(st: &mut CowState, pid: u32) -> Option { - if st.virtual_cwds.is_empty() { - return None; - } - let pid_key = cow_pid_key(pid)?; - st.prune_reused_pid(pid_key); - st.virtual_cwds.get(&pid_key).cloned() -} - // ============================================================ // openat handler // ============================================================ @@ -132,6 +134,7 @@ fn current_virtual_cwd(st: &mut CowState, pid: u32) -> Option { pub(crate) async fn handle_cow_open( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { use crate::cow::seccomp::CowOpenPlan; @@ -151,8 +154,7 @@ pub(crate) async fn handle_cow_open( None => return NotifAction::Continue, }; let virtual_cwd = if (dirfd as i32) == libc::AT_FDCWD && !Path::new(&rel_path).is_absolute() { - let mut st = cow_state.lock().await; - current_virtual_cwd(&mut st, notif.pid) + current_virtual_cwd(processes, notif.pid).await } else { None }; @@ -476,12 +478,10 @@ async fn execute_deferred_copy( pub(crate) async fn handle_cow_write( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { - let virtual_cwd = { - let mut st = cow_state.lock().await; - current_virtual_cwd(&mut st, notif.pid) - }; + let virtual_cwd = current_virtual_cwd(processes, notif.pid).await; let mut op = match parse_cow_write(notif, notif_fd, virtual_cwd.as_deref()) { Some(op) => op, None => return NotifAction::Continue, @@ -573,17 +573,15 @@ pub(crate) const SYS_FACCESSAT2: i64 = 439; pub(crate) async fn handle_cow_access( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { let nr = notif.data.nr as i64; + let virtual_cwd = current_virtual_cwd(processes, notif.pid).await; // access(pathname, mode): args[0]=path, 
args[1]=mode // faccessat(dirfd, pathname, mode, flags): args[0]=dirfd, args[1]=path, args[2]=mode let (path, mode) = if Some(nr) == arch::SYS_ACCESS { - let virtual_cwd = { - let mut st = cow_state.lock().await; - current_virtual_cwd(&mut st, notif.pid) - }; let p = match read_path(notif, notif.data.args[0], notif_fd) { Some(p) => resolve_at_path_with_virtual( notif, @@ -596,10 +594,6 @@ pub(crate) async fn handle_cow_access( (p, notif.data.args[1] as i32) } else { let dirfd = notif.data.args[0] as i64; - let virtual_cwd = { - let mut st = cow_state.lock().await; - current_virtual_cwd(&mut st, notif.pid) - }; let p = match read_path(notif, notif.data.args[1], notif_fd) { Some(p) => resolve_at_path_with_virtual(notif, dirfd, &p, virtual_cwd.as_deref()), None => return NotifAction::Continue, @@ -642,6 +636,7 @@ pub(crate) async fn handle_cow_access( pub(crate) async fn handle_cow_utimensat( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { let dirfd = notif.data.args[0] as i64; @@ -653,10 +648,7 @@ pub(crate) async fn handle_cow_utimensat( return NotifAction::Continue; } - let virtual_cwd = { - let mut st = cow_state.lock().await; - current_virtual_cwd(&mut st, notif.pid) - }; + let virtual_cwd = current_virtual_cwd(processes, notif.pid).await; let path = match read_path(notif, path_ptr, notif_fd) { Some(p) => resolve_at_path_with_virtual(notif, dirfd, &p, virtual_cwd.as_deref()), None => return NotifAction::Continue, @@ -717,6 +709,7 @@ pub(crate) async fn handle_cow_utimensat( pub(crate) async fn handle_cow_stat( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { let nr = notif.data.nr as i64; @@ -724,10 +717,7 @@ pub(crate) async fn handle_cow_stat( // newfstatat(dirfd, pathname, statbuf, flags) // faccessat(dirfd, pathname, mode, flags) let dirfd = notif.data.args[0] as i64; - let virtual_cwd = { - let mut st = cow_state.lock().await; - current_virtual_cwd(&mut st, 
notif.pid) - }; + let virtual_cwd = current_virtual_cwd(processes, notif.pid).await; let path = match read_path(notif, notif.data.args[1], notif_fd) { Some(p) => resolve_at_path_with_virtual(notif, dirfd, &p, virtual_cwd.as_deref()), None => return NotifAction::Continue, @@ -794,14 +784,12 @@ pub(crate) async fn handle_cow_stat( pub(crate) async fn handle_cow_statx( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { // statx(dirfd, pathname, flags, mask, statxbuf) let dirfd = notif.data.args[0] as i64; - let virtual_cwd = { - let mut st = cow_state.lock().await; - current_virtual_cwd(&mut st, notif.pid) - }; + let virtual_cwd = current_virtual_cwd(processes, notif.pid).await; let path = match read_path(notif, notif.data.args[1], notif_fd) { Some(p) => resolve_at_path_with_virtual(notif, dirfd, &p, virtual_cwd.as_deref()), None => return NotifAction::Continue, @@ -828,14 +816,12 @@ pub(crate) async fn handle_cow_statx( pub(crate) async fn handle_cow_readlink( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { // readlinkat(dirfd, pathname, buf, bufsiz) let dirfd = notif.data.args[0] as i64; - let virtual_cwd = { - let mut st = cow_state.lock().await; - current_virtual_cwd(&mut st, notif.pid) - }; + let virtual_cwd = current_virtual_cwd(processes, notif.pid).await; let path = match read_path(notif, notif.data.args[1], notif_fd) { Some(p) => resolve_at_path_with_virtual(notif, dirfd, &p, virtual_cwd.as_deref()), None => return NotifAction::Continue, @@ -875,96 +861,107 @@ pub(crate) async fn handle_cow_readlink( pub(crate) async fn handle_cow_getdents( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { let pid = notif.pid; let child_fd = (notif.data.args[0] & 0xFFFFFFFF) as u32; let buf_addr = notif.data.args[1]; let buf_size = (notif.data.args[2] & 0xFFFFFFFF) as usize; - let pid_key = match cow_pid_key(pid) { - Some(key) => key, - None 
=> return NotifAction::Continue, - }; - // Check if fd points to a COW-managed directory + // Check if fd points to a COW-managed directory. let link_path = format!("/proc/{}/fd/{}", pid, child_fd); let target = match std::fs::read_link(&link_path) { Ok(t) => t.to_string_lossy().into_owned(), Err(_) => return NotifAction::Continue, }; - let mut st = cow_state.lock().await; - st.prune_reused_pid(pid_key); - let cow = match st.branch.as_ref() { - Some(c) => c, - None => return NotifAction::Continue, - }; - - if !cow.has_changes() { - return NotifAction::Continue; - } - - let target_path = Path::new(&target); - let rel_path = if cow.matches(&target) { - cow.safe_rel(&target).unwrap_or_else(|| ".".to_string()) - } else if let Ok(rel) = target_path.strip_prefix(cow.upper_dir()) { - let rel = rel.to_string_lossy(); - if rel.is_empty() { - ".".to_string() + // Compute rel_path under the global COW lock, but do not hold it + // across the per-process lock acquired below. + let rel_path = { + let st = cow_state.lock().await; + let cow = match st.branch.as_ref() { + Some(c) => c, + None => return NotifAction::Continue, + }; + if !cow.has_changes() { + return NotifAction::Continue; + } + let target_path = Path::new(&target); + if cow.matches(&target) { + cow.safe_rel(&target).unwrap_or_else(|| ".".to_string()) + } else if let Ok(rel) = target_path.strip_prefix(cow.upper_dir()) { + let rel = rel.to_string_lossy(); + if rel.is_empty() { + ".".to_string() + } else { + rel.into_owned() + } } else { - rel.into_owned() + return NotifAction::Continue; } - } else { - return NotifAction::Continue; }; - // Build cache on first call; invalidate if fd was reused for a different dir. - let cache_key = (pid_key, child_fd); - if let Some((cached_target, entries)) = st.dir_cache.get(&cache_key) { + // Per-process dir cache lookup. 
+ let pp = match pp_handle(processes, pid) { + Some(h) => h, + None => return NotifAction::Continue, + }; + let mut perproc = pp.lock().await; + + // Invalidate stale cache (fd reused for a different directory), + // and short-circuit EOF on a previously fully-drained entry. + if let Some((cached_target, entries)) = perproc.cow_dir_cache.get(&child_fd) { if *cached_target != target { - // fd reused for a different directory — rebuild. - st.dir_cache.remove(&cache_key); + perproc.cow_dir_cache.remove(&child_fd); } else if entries.is_empty() { - // Previously fully drained — return end-of-directory and clean up. - st.dir_cache.remove(&cache_key); + perproc.cow_dir_cache.remove(&child_fd); return NotifAction::ReturnValue(0); } } - if !st.dir_cache.contains_key(&cache_key) { - let cow = st.branch.as_ref().unwrap(); - let merged = cow.list_merged_dir(&rel_path); - - let upper_dir = cow.upper_dir().join(&rel_path); - let lower_dir = cow.workdir().join(&rel_path); - - let mut entries = Vec::new(); - let mut d_off: i64 = 0; - for name in &merged { - d_off += 1; - let upper_p = upper_dir.join(name); - let lower_p = lower_dir.join(name); - let check = if upper_p.exists() || upper_p.is_symlink() { - &upper_p - } else { - &lower_p - }; - let d_type = if check.is_dir() { - DT_DIR - } else if check.is_symlink() { - DT_LNK - } else { - DT_REG + + // Build cache on first call. 
+ if !perproc.cow_dir_cache.contains_key(&child_fd) { + let entries = { + let st = cow_state.lock().await; + let cow = match st.branch.as_ref() { + Some(c) => c, + None => return NotifAction::Continue, }; - use std::os::unix::fs::MetadataExt; - let d_ino = std::fs::symlink_metadata(check) - .map(|m| m.ino()) - .unwrap_or(0); - entries.push(build_dirent64(d_ino, d_off, d_type, name)); - } - st.dir_cache.insert(cache_key, (target.clone(), entries)); + let merged = cow.list_merged_dir(&rel_path); + let upper_dir = cow.upper_dir().join(&rel_path); + let lower_dir = cow.workdir().join(&rel_path); + + let mut out = Vec::new(); + let mut d_off: i64 = 0; + for name in &merged { + d_off += 1; + let upper_p = upper_dir.join(name); + let lower_p = lower_dir.join(name); + let check = if upper_p.exists() || upper_p.is_symlink() { + &upper_p + } else { + &lower_p + }; + let d_type = if check.is_dir() { + DT_DIR + } else if check.is_symlink() { + DT_LNK + } else { + DT_REG + }; + use std::os::unix::fs::MetadataExt; + let d_ino = std::fs::symlink_metadata(check) + .map(|m| m.ino()) + .unwrap_or(0); + out.push(build_dirent64(d_ino, d_off, d_type, name)); + } + out + }; + perproc.cow_dir_cache.insert(child_fd, (target.clone(), entries)); } - let entries = match st.dir_cache.get_mut(&cache_key) { + let entries = match perproc.cow_dir_cache.get_mut(&child_fd) { Some((_, e)) => e, None => return NotifAction::Continue, }; @@ -982,12 +979,7 @@ pub(crate) async fn handle_cow_getdents( if consumed > 0 { entries.drain(..consumed); } - if entries.is_empty() { - // Mark as fully read by leaving an empty entry list in the cache. - // This prevents rebuilding the cache on the next call — the empty - // cache will produce ReturnValue(0) which signals end-of-directory. 
- } - drop(st); + drop(perproc); if !result.is_empty() { if write_child_mem(notif_fd, notif.id, pid, buf_addr, &result).is_err() { @@ -1006,6 +998,7 @@ pub(crate) async fn handle_cow_getdents( pub(crate) async fn handle_cow_chdir( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { let path_ptr = notif.data.args[0]; @@ -1015,31 +1008,31 @@ pub(crate) async fn handle_cow_chdir( }; let orig_path_buf_len = path.len() + 1; // NUL-terminated size in child memory - let mut st = cow_state.lock().await; - let virtual_cwd = current_virtual_cwd(&mut st, notif.pid); - let abs_path = resolve_at_path_with_virtual( + let virtual_cwd = current_virtual_cwd(processes, notif.pid).await; + let resolved = resolve_at_path_with_virtual( notif, libc::AT_FDCWD as i64, &path, virtual_cwd.as_deref(), ); - let cow = match st.branch.as_ref() { - Some(c) => c, - None => return NotifAction::Continue, - }; - - let abs_path = map_cow_upper_path(cow, &abs_path); - if !cow.matches(&abs_path) { - return NotifAction::Continue; - } - // Check if it exists in the upper layer. - let rel = match cow.safe_rel(&abs_path) { - Some(r) => r, - None => return NotifAction::Continue, + let (abs_path, upper_path) = { + let st = cow_state.lock().await; + let cow = match st.branch.as_ref() { + Some(c) => c, + None => return NotifAction::Continue, + }; + let abs_path = map_cow_upper_path(cow, &resolved); + if !cow.matches(&abs_path) { + return NotifAction::Continue; + } + let rel = match cow.safe_rel(&abs_path) { + Some(r) => r, + None => return NotifAction::Continue, + }; + let upper_path = cow.upper_dir().join(&rel); + (abs_path, upper_path) }; - let upper_path = cow.upper_dir().join(&rel); - drop(st); // If the directory exists on the real filesystem, let the kernel handle it. 
if std::path::Path::new(&abs_path).is_dir() { @@ -1096,10 +1089,17 @@ pub(crate) async fn handle_cow_chdir( return NotifAction::Errno(libc::EFAULT); } - if let Some(pid_key) = cow_pid_key(notif.pid) { - let mut st = cow_state.lock().await; - st.prune_reused_pid(pid_key); - st.virtual_cwds.insert(pid_key, abs_path); + // We insert the virtual cwd here, before returning Continue and + // letting the kernel run the rewritten chdir. We can't observe + // the kernel's verdict without polling, but at this point we've + // verified upper_path is a directory, the addfd ioctl succeeded, + // and write_child_mem rewrote the path argument — so a kernel + // chdir to /proc/self/fd/N is essentially guaranteed. If it does + // somehow fail, the per-child pidfd watcher will drop this entry + // when the process exits, so the inconsistency is bounded by + // process lifetime. + if let Some(pp) = pp_handle(processes, notif.pid) { + pp.lock().await.virtual_cwd = Some(abs_path); } NotifAction::Continue @@ -1109,21 +1109,21 @@ pub(crate) async fn handle_cow_chdir( pub(crate) async fn handle_cow_getcwd( notif: &SeccompNotif, cow_state: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { let buf_addr = notif.data.args[0]; let buf_size = (notif.data.args[1] & 0xFFFF_FFFF) as usize; - let mut st = cow_state.lock().await; - let cached_virtual_cwd = current_virtual_cwd(&mut st, notif.pid); - let cow = match st.branch.as_ref() { - Some(c) => c, - None => return NotifAction::Continue, - }; - + let cached_virtual_cwd = current_virtual_cwd(processes, notif.pid).await; let virtual_cwd = if let Some(cwd) = cached_virtual_cwd { cwd } else { + let st = cow_state.lock().await; + let cow = match st.branch.as_ref() { + Some(c) => c, + None => return NotifAction::Continue, + }; let cwd = match std::fs::read_link(format!("/proc/{}/cwd", notif.pid)) { Ok(c) => c, Err(_) => return NotifAction::Continue, @@ -1133,7 +1133,6 @@ pub(crate) async fn handle_cow_getcwd( Err(_) => return 
NotifAction::Continue, } }; - drop(st); let cwd_bytes = virtual_cwd.as_bytes(); if cwd_bytes.len() + 1 > buf_size { diff --git a/crates/sandlock-core/src/procfs.rs b/crates/sandlock-core/src/procfs.rs index 9878150..f49725a 100644 --- a/crates/sandlock-core/src/procfs.rs +++ b/crates/sandlock-core/src/procfs.rs @@ -12,7 +12,7 @@ use std::sync::Arc; use tokio::sync::Mutex; use crate::seccomp::notif::{read_child_cstr, write_child_mem, NotifAction, NotifPolicy}; -use crate::seccomp::state::{NetworkState, ProcfsState}; +use crate::seccomp::state::{NetworkState, ProcessIndex}; use crate::sys::structs::{SeccompNotif, EACCES}; use crate::sys::syscall; @@ -380,7 +380,7 @@ fn read_path(notif: &SeccompNotif, addr: u64, notif_fd: RawFd) -> Option /// - Lets everything else through. pub(crate) async fn handle_proc_open( notif: &SeccompNotif, - procfs: &Arc>, + processes: &Arc, resource: &Arc>, network: &Arc>, policy: &NotifPolicy, @@ -404,8 +404,7 @@ pub(crate) async fn handle_proc_open( // already hide non-sandbox PIDs, but without this check a process // could still open /proc/{ppid}/cmdline (or any guessed PID) directly. if let Some(pid) = extract_proc_pid(&path) { - let pfs = procfs.lock().await; - if !pfs.proc_pids.contains(&pid) { + if !processes.contains(pid) { return NotifAction::Errno(EACCES); } } @@ -435,11 +434,10 @@ pub(crate) async fn handle_proc_open( // Virtualize /proc/loadavg when proc virtualization is active. if path == "/proc/loadavg" { - let pfs = procfs.lock().await; + let total = processes.len() as u32; + let last_pid = processes.max_pid().unwrap_or(0); let rs = resource.lock().await; - let total = pfs.proc_pids.len() as u32; let running = rs.proc_count; - let last_pid = pfs.proc_pids.iter().max().copied().unwrap_or(0); let content = generate_loadavg(&rs.load_avg, running, total, last_pid); return inject_memfd(&content); } @@ -612,7 +610,7 @@ pub(crate) fn handle_etc_hosts_open( /// regardless of filesystem internals. 
pub(crate) async fn handle_sorted_getdents( notif: &SeccompNotif, - procfs: &Arc>, + processes: &Arc, notif_fd: RawFd, ) -> NotifAction { let pid = notif.pid; @@ -625,16 +623,17 @@ pub(crate) async fn handle_sorted_getdents( Ok(t) => t, Err(_) => return NotifAction::Continue, }; - let cache_key = ( - pid as i32, - child_fd, - dir_path.to_string_lossy().into_owned(), - ); - let mut pfs = procfs.lock().await; + + let entry = match processes.entry_for(pid as i32) { + Some(e) => e, + None => return NotifAction::Continue, + }; + let cache_key = (child_fd, dir_path.to_string_lossy().into_owned()); + let mut perproc = entry.1.lock().await; // Build and cache sorted entries on first call for this open directory. // Remove an empty cache on EOF so later fd reuse can rebuild entries. - if !pfs.getdents_cache.contains_key(&cache_key) { + if !perproc.procfs_dir_cache.contains_key(&cache_key) { let dir = match std::fs::read_dir(&dir_path) { Ok(d) => d, Err(_) => return NotifAction::Continue, @@ -679,17 +678,17 @@ pub(crate) async fn handle_sorted_getdents( }) .collect(); - pfs.getdents_cache.insert(cache_key.clone(), entries); + perproc.procfs_dir_cache.insert(cache_key.clone(), entries); } - let entries = match pfs.getdents_cache.get_mut(&cache_key) { + let entries = match perproc.procfs_dir_cache.get_mut(&cache_key) { Some(e) => e, None => return NotifAction::Continue, }; // Empty cache = already fully drained on a prior call → return 0 (EOF). if entries.is_empty() { - pfs.getdents_cache.remove(&cache_key); + perproc.procfs_dir_cache.remove(&cache_key); return NotifAction::ReturnValue(0); } @@ -708,7 +707,7 @@ pub(crate) async fn handle_sorted_getdents( entries.drain(..consumed); } - drop(pfs); + drop(perproc); if !result.is_empty() { if write_child_mem(notif_fd, notif.id, pid, buf_addr, &result).is_err() { @@ -795,7 +794,7 @@ fn build_filtered_dirents(sandbox_pids: &HashSet) -> Vec> { /// set of entries that hides PIDs not belonging to the sandbox. 
pub(crate) async fn handle_getdents( notif: &SeccompNotif, - procfs: &Arc>, + processes: &Arc, _policy: &NotifPolicy, notif_fd: RawFd, ) -> NotifAction { @@ -814,16 +813,24 @@ pub(crate) async fn handle_getdents( return NotifAction::Continue; } - let cache_key = (pid as i32, child_fd, target.to_string_lossy().into_owned()); - let mut pfs = procfs.lock().await; + let entry = match processes.entry_for(pid as i32) { + Some(e) => e, + None => return NotifAction::Continue, + }; + let cache_key = (child_fd, target.to_string_lossy().into_owned()); + let mut perproc = entry.1.lock().await; - // Build and cache entries on first call for this (pid, fd) pair. - if !pfs.getdents_cache.contains_key(&cache_key) { - let entries = build_filtered_dirents(&pfs.proc_pids); - pfs.getdents_cache.insert(cache_key.clone(), entries); + // Build and cache entries on first call for this (fd, target) pair. + if !perproc.procfs_dir_cache.contains_key(&cache_key) { + // Snapshot sandbox PIDs without holding the per-process lock + // any longer than needed — pids_snapshot only takes the + // ProcessIndex read lock briefly. + let snapshot = processes.pids_snapshot(); + let entries = build_filtered_dirents(&snapshot); + perproc.procfs_dir_cache.insert(cache_key.clone(), entries); } - let entries = match pfs.getdents_cache.get_mut(&cache_key) { + let entries = match perproc.procfs_dir_cache.get_mut(&cache_key) { Some(e) => e, None => return NotifAction::Continue, }; @@ -841,7 +848,7 @@ pub(crate) async fn handle_getdents( // Empty cache = already fully drained on a prior call → return 0 (EOF). if entries.is_empty() { - pfs.getdents_cache.remove(&cache_key); + perproc.procfs_dir_cache.remove(&cache_key); return NotifAction::ReturnValue(0); } @@ -849,7 +856,7 @@ pub(crate) async fn handle_getdents( entries.drain(..consumed); } - drop(pfs); + drop(perproc); // Write the result into the child's buffer and return the byte count. 
if !result.is_empty() { diff --git a/crates/sandlock-core/src/resource.rs b/crates/sandlock-core/src/resource.rs index cab2be8..00867ca 100644 --- a/crates/sandlock-core/src/resource.rs +++ b/crates/sandlock-core/src/resource.rs @@ -3,8 +3,9 @@ use std::sync::Arc; use tokio::sync::Mutex; -use crate::seccomp::notif::{NotifAction, NotifPolicy}; -use crate::seccomp::state::{ProcfsState, ResourceState}; +use crate::seccomp::ctx::SupervisorCtx; +use crate::seccomp::notif::{spawn_pid_watcher, NotifAction, NotifPolicy}; +use crate::seccomp::state::ResourceState; use crate::sys::structs::{ SeccompNotif, CLONE_NS_FLAGS, EAGAIN, EPERM, }; @@ -17,13 +18,17 @@ const MAP_ANONYMOUS: u64 = 0x20; /// Handle fork/clone/vfork notifications. /// -/// Enforces namespace creation ban, process limits, and checkpoint hold. -/// Needs both `ResourceState` (for proc_count, hold_forks, etc.) and -/// `ProcfsState` (for proc_pids). +/// Enforces namespace creation ban and process limits, registers the +/// new child in `ProcessIndex` (with an owned pidfd), and spawns a +/// per-child pidfd watcher that runs unified cleanup on exit. +/// +/// Note: `notif.pid` here is the *parent* (the task issuing +/// clone/fork). The kernel hasn't run the syscall yet, so we don't +/// know the child's pid. The child is discovered and registered later, +/// on its first own seccomp notification, via `register_child_if_new`. pub(crate) async fn handle_fork( notif: &SeccompNotif, resource: &Arc>, - procfs: &Arc>, _policy: &NotifPolicy, ) -> NotifAction { let nr = notif.data.nr as i64; @@ -55,12 +60,39 @@ pub(crate) async fn handle_fork( } rs.proc_count += 1; - drop(rs); + NotifAction::Continue +} + +/// If `notif.pid` is not yet tracked in the ProcessIndex, register +/// it: open a pidfd, record the canonical PidKey, and spawn the exit +/// watcher. 
Called from the supervisor's notification dispatcher +/// before per-syscall handlers run, so handlers can rely on +/// `ProcessIndex::key_for(notif.pid)` returning a fresh PidKey. +/// +/// The fast path is a single `RwLock` read: if the pid is already +/// tracked, we trust the entry. PID-identity correctness comes from +/// the per-child pidfd watcher — a process can't issue notifications +/// after it has exited, and the kernel won't recycle a PID until the +/// parent has waited (which we observe), so a stale entry has no +/// window in which to be hit. We deliberately do *not* re-stat +/// /proc//stat on every notification. +pub(crate) async fn register_child_if_new(ctx: &Arc, pid: i32) { + if ctx.processes.contains(pid) { + return; + } - let mut pfs = procfs.lock().await; - pfs.proc_pids.insert(notif.pid as i32); + let pidfd = match crate::sys::syscall::pidfd_open(pid as u32, 0) { + Ok(fd) => fd, + Err(_) => return, // old kernel or process gone — GC backstop will clean up + }; - NotifAction::Continue + let key = match ctx.processes.register(pid) { + Some(k) => k, + None => return, // process exited between pidfd_open and stat read + }; + + // Hand the pidfd to the watcher; it owns the fd's lifetime now. + spawn_pid_watcher(Arc::clone(ctx), key, pidfd); } /// Handle wait4/waitid notifications — decrement the concurrent process count. @@ -82,14 +114,14 @@ pub(crate) async fn handle_wait( /// Tracks anonymous memory usage and enforces the configured memory limit. 
pub(crate) async fn handle_memory( notif: &SeccompNotif, - resource: &Arc>, + ctx: &Arc, policy: &NotifPolicy, ) -> NotifAction { let nr = notif.data.nr as i64; let args = ¬if.data.args; let limit = policy.max_memory_bytes; - let mut st = resource.lock().await; + let mut st = ctx.resource.lock().await; let kill = NotifAction::Kill { sig: libc::SIGKILL, pgid: notif.pid as i32 }; @@ -110,26 +142,36 @@ pub(crate) async fn handle_memory( } else if nr == libc::SYS_brk { // args[0] = new_brk let new_brk = args[0]; - let pid = notif.pid as i32; if new_brk == 0 { // Query: return Continue, kernel handles it. return NotifAction::Continue; } - let base = *st.brk_bases.entry(pid).or_insert(new_brk); - + // Per-process brk base is in PerProcessState. Drop the global + // ResourceState lock first to avoid lock ordering issues with + // the per-process lock acquired below (per-process first, + // then global, when both are needed). + drop(st); + let entry = match ctx.processes.entry_for(notif.pid as i32) { + Some(e) => e, + None => return NotifAction::Continue, + }; + let mut perproc = entry.1.lock().await; + let mut st = ctx.resource.lock().await; + + let base = *perproc.brk_base.get_or_insert(new_brk); if new_brk > base { let delta = new_brk - base; if st.mem_used.saturating_add(delta) > limit { return kill; } st.mem_used += delta; - st.brk_bases.insert(pid, new_brk); + perproc.brk_base = Some(new_brk); } else if new_brk < base { let delta = base - new_brk; st.mem_used = st.mem_used.saturating_sub(delta); - st.brk_bases.insert(pid, new_brk); + perproc.brk_base = Some(new_brk); } } else if nr == libc::SYS_mremap { // args[1] = old_len, args[2] = new_len diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 81dead1..190dc2f 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -957,9 +957,8 @@ impl Sandbox { net_state.port_map.on_bind = Some(cb); } - // ProcfsState - let mut procfs_state = 
ProcfsState::new(); - procfs_state.proc_pids.insert(pid); + // ProcfsState (sandbox membership lives in ProcessIndex now). + let procfs_state = ProcfsState::new(); // ResourceState let mut res_state = ResourceState::new( @@ -1029,6 +1028,9 @@ impl Sandbox { let time_random_state = Arc::new(Mutex::new(time_random_state)); let policy_fn_state = Arc::new(Mutex::new(policy_fn_state)); let chroot_state = Arc::new(Mutex::new(chroot_state)); + // Root child is registered (with watcher) on its first + // notification, the same path grandchildren take. + let processes = Arc::new(crate::seccomp::state::ProcessIndex::new()); let ctx = Arc::new(SupervisorCtx { resource: Arc::clone(&res_state), @@ -1039,6 +1041,7 @@ impl Sandbox { policy_fn: Arc::clone(&policy_fn_state), chroot: Arc::clone(&chroot_state), netlink: Arc::new(crate::netlink::NetlinkState::new()), + processes: Arc::clone(&processes), policy: Arc::new(notif_policy), child_pidfd: child_pidfd_raw, notif_fd: notif_raw_fd, diff --git a/crates/sandlock-core/src/seccomp/ctx.rs b/crates/sandlock-core/src/seccomp/ctx.rs index e415ae9..bb1be30 100644 --- a/crates/sandlock-core/src/seccomp/ctx.rs +++ b/crates/sandlock-core/src/seccomp/ctx.rs @@ -3,7 +3,10 @@ use std::sync::Arc; use tokio::sync::Mutex; use super::notif::NotifPolicy; -use super::state::{ChrootState, CowState, NetworkState, PolicyFnState, ProcfsState, ResourceState, TimeRandomState}; +use super::state::{ + ChrootState, CowState, NetworkState, PolicyFnState, ProcessIndex, ProcfsState, ResourceState, + TimeRandomState, +}; /// Holds all supervisor state and policy. Passed to every handler. pub struct SupervisorCtx { @@ -23,6 +26,11 @@ pub struct SupervisorCtx { pub chroot: Arc>, /// NETLINK_ROUTE virtualization state. pub netlink: Arc, + /// Per-process registry: pid → PidKey. Source of truth for + /// "which processes are in the sandbox" and the anchor for + /// unified per-process state cleanup. 
Wraps an internal RwLock, + /// so handlers can query it synchronously without `.await`. + pub processes: Arc, /// Immutable policy — no lock needed. pub policy: Arc, /// pidfd for the child process (immutable after spawn). diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index 8e1479f..34e1178 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -103,12 +103,11 @@ pub fn build_dispatch_table( for nr in fork_nrs { let policy = Arc::clone(policy); let resource = Arc::clone(resource); - table.register(nr, Box::new(move |notif, ctx, _notif_fd| { + table.register(nr, Box::new(move |notif, _ctx, _notif_fd| { let policy = Arc::clone(&policy); let resource = Arc::clone(&resource); - let procfs_inner = Arc::clone(&ctx.procfs); Box::pin(async move { - crate::resource::handle_fork(¬if, &resource, &procfs_inner, &policy).await + crate::resource::handle_fork(¬if, &resource, &policy).await }) })); } @@ -135,12 +134,10 @@ pub fn build_dispatch_table( libc::SYS_mremap, libc::SYS_shmget, ] { let policy = Arc::clone(policy); - let resource = Arc::clone(resource); - table.register(nr, Box::new(move |notif, _ctx, _notif_fd| { + table.register(nr, Box::new(move |notif, ctx, _notif_fd| { let policy = Arc::clone(&policy); - let resource = Arc::clone(&resource); Box::pin(async move { - crate::resource::handle_memory(¬if, &resource, &policy).await + crate::resource::handle_memory(¬if, &ctx, &policy).await }) })); } @@ -233,10 +230,10 @@ pub fn build_dispatch_table( table.register(libc::SYS_openat, Box::new(move |notif, ctx, notif_fd| { let policy = Arc::clone(&policy); let resource = Arc::clone(&resource); - let procfs_inner = Arc::clone(&ctx.procfs); + let processes = Arc::clone(&ctx.processes); let network = Arc::clone(&ctx.network); Box::pin(async move { - crate::procfs::handle_proc_open(¬if, &procfs_inner, &resource, &network, &policy, notif_fd).await + 
crate::procfs::handle_proc_open(¬if, &processes, &resource, &network, &policy, notif_fd).await }) })); } @@ -248,9 +245,9 @@ pub fn build_dispatch_table( let policy = Arc::clone(policy); table.register(nr, Box::new(move |notif, ctx, notif_fd| { let policy = Arc::clone(&policy); - let procfs_inner = Arc::clone(&ctx.procfs); + let processes = Arc::clone(&ctx.processes); Box::pin(async move { - crate::procfs::handle_getdents(¬if, &procfs_inner, &policy, notif_fd).await + crate::procfs::handle_getdents(¬if, &processes, &policy, notif_fd).await }) })); } @@ -317,9 +314,9 @@ pub fn build_dispatch_table( } for nr in getdents_nrs { table.register(nr, Box::new(|notif, ctx, notif_fd| { - let procfs_inner = Arc::clone(&ctx.procfs); + let processes = Arc::clone(&ctx.processes); Box::pin(async move { - crate::procfs::handle_sorted_getdents(¬if, &procfs_inner, notif_fd).await + crate::procfs::handle_sorted_getdents(¬if, &processes, notif_fd).await }) })); } @@ -606,6 +603,17 @@ fn register_chroot_handlers(table: &mut DispatchTable, policy: &Arc // ============================================================ fn register_cow_handlers(table: &mut DispatchTable) { + // Helper to grab cow + processes from ctx in one place. + macro_rules! 
cow_call { + ($handler:expr) => { + Box::new(|notif, ctx, notif_fd| { + let cow = Arc::clone(&ctx.cow); + let processes = Arc::clone(&ctx.processes); + Box::pin(async move { $handler(&notif, &cow, &processes, notif_fd).await }) + }) + }; + } + // Write syscalls (*at variants + legacy) let mut write_nrs = vec![ libc::SYS_unlinkat, libc::SYS_mkdirat, libc::SYS_renameat2, @@ -618,108 +626,43 @@ fn register_cow_handlers(table: &mut DispatchTable) { arch::SYS_LCHOWN, ].into_iter().flatten()); for nr in write_nrs { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_write(&notif, &cow, notif_fd).await - }) - })); + table.register(nr, cow_call!(crate::cow::dispatch::handle_cow_write)); } - // utimensat — unconditional return - table.register(libc::SYS_utimensat, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_utimensat(&notif, &cow, notif_fd).await - }) - })); + table.register(libc::SYS_utimensat, cow_call!(crate::cow::dispatch::handle_cow_utimensat)); - // faccessat/access — fallthrough - let mut access_nrs = vec![ - libc::SYS_faccessat, - crate::cow::dispatch::SYS_FACCESSAT2, - ]; + let mut access_nrs = vec![libc::SYS_faccessat, crate::cow::dispatch::SYS_FACCESSAT2]; access_nrs.extend(arch::SYS_ACCESS); for nr in access_nrs { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_access(&notif, &cow, notif_fd).await - }) - })); + table.register(nr, cow_call!(crate::cow::dispatch::handle_cow_access)); } - // openat/open — fallthrough let mut open_nrs = vec![libc::SYS_openat]; open_nrs.extend(arch::SYS_OPEN); for nr in open_nrs { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_open(&notif, &cow, notif_fd).await - }) - })); +
table.register(nr, cow_call!(crate::cow::dispatch::handle_cow_open)); } - // stat family — fallthrough - let mut stat_nrs = vec![ - libc::SYS_newfstatat, libc::SYS_faccessat, - ]; + let mut stat_nrs = vec![libc::SYS_newfstatat, libc::SYS_faccessat]; stat_nrs.extend([arch::SYS_STAT, arch::SYS_LSTAT, arch::SYS_ACCESS].into_iter().flatten()); for nr in stat_nrs { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_stat(&notif, &cow, notif_fd).await - }) - })); + table.register(nr, cow_call!(crate::cow::dispatch::handle_cow_stat)); } - // statx — fallthrough - table.register(libc::SYS_statx, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_statx(&notif, &cow, notif_fd).await - }) - })); + table.register(libc::SYS_statx, cow_call!(crate::cow::dispatch::handle_cow_statx)); - // readlink — fallthrough let mut readlink_nrs = vec![libc::SYS_readlinkat]; readlink_nrs.extend(arch::SYS_READLINK); for nr in readlink_nrs { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_readlink(&notif, &cow, notif_fd).await - }) - })); + table.register(nr, cow_call!(crate::cow::dispatch::handle_cow_readlink)); } - // getdents — fallthrough let mut getdents_nrs = vec![libc::SYS_getdents64]; getdents_nrs.extend(arch::SYS_GETDENTS); for nr in getdents_nrs { - table.register(nr, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_getdents(&notif, &cow, notif_fd).await - }) - })); + table.register(nr, cow_call!(crate::cow::dispatch::handle_cow_getdents)); } - // chdir — redirect to upper dir if target was created by COW - table.register(libc::SYS_chdir, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { -
crate::cow::dispatch::handle_cow_chdir(&notif, &cow, notif_fd).await - }) - })); - - // getcwd — return logical workdir path after chdir into a COW-only dir - table.register(libc::SYS_getcwd, Box::new(|notif, ctx, notif_fd| { - let cow = Arc::clone(&ctx.cow); - Box::pin(async move { - crate::cow::dispatch::handle_cow_getcwd(&notif, &cow, notif_fd).await - }) - })); + table.register(libc::SYS_chdir, cow_call!(crate::cow::dispatch::handle_cow_chdir)); + table.register(libc::SYS_getcwd, cow_call!(crate::cow::dispatch::handle_cow_getcwd)); } diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 03a1981..71c4e34 100644 --- a/crates/sandlock-core/src/seccomp/notif.rs +++ b/crates/sandlock-core/src/seccomp/notif.rs @@ -862,6 +862,12 @@ async fn handle_notification( ) { let policy = &ctx.policy; + // Ensure every pid that produces a notification is tracked in the + // ProcessIndex with an exit watcher. The fork handler runs on the + // *parent* pid (the child doesn't exist yet at clone-time), so the + // child gets registered the first time it issues its own syscall. + crate::resource::register_child_if_new(ctx, notif.pid as i32).await; + // Re-patch vDSO if needed (exec replaces it with a fresh copy). if policy.has_time_start || policy.has_random_seed { let mut pfs = ctx.procfs.lock().await; @@ -950,9 +956,79 @@ pub async fn supervisor( } }); + // Periodic sweep as a defensive backstop in case pidfd-based + // lifecycle cleanup misses an entry (e.g. pidfd_open failed for a + // child on an old kernel, or its watcher panicked). At 5 minutes + // this is cheap enough to leave on; the primary cleanup path is + // still per-child pidfd readiness in `spawn_pid_watcher`.
+ let gc = tokio::spawn(process_index_gc(Arc::clone(&ctx.processes))); + while let Some(notif) = rx.recv().await { handle_notification(notif, &ctx, &dispatch_table, fd).await; } + + gc.abort(); +} + +/// Periodic sweep that drops `ProcessIndex` entries for exited PIDs. +/// Per-process state hangs off these entries via `Arc`, so dropping +/// them releases everything in one step. +async fn process_index_gc(processes: Arc) { + let interval = std::time::Duration::from_secs(300); + loop { + tokio::time::sleep(interval).await; + if processes.len() == 0 { + continue; + } + processes.prune_dead(); + } +} + +/// Spawn a per-child task that awaits the pidfd becoming readable +/// (process exit) and then runs unified cleanup across every +/// per-process supervisor map. +/// +/// The watcher *owns* the pidfd via `AsyncFd` — the kernel +/// fd stays alive for as long as tokio's IO driver has it registered, +/// and is closed exactly once when the watcher task ends. This avoids +/// a TOCTOU where dropping the fd from a separate map could let a +/// recycled fd be deregistered from epoll. +pub(crate) fn spawn_pid_watcher( + ctx: Arc, + key: super::state::PidKey, + pidfd: std::os::unix::io::OwnedFd, +) { + tokio::spawn(async move { + let async_fd = match tokio::io::unix::AsyncFd::with_interest( + pidfd, + tokio::io::Interest::READABLE, + ) { + Ok(f) => f, + Err(_) => { + // AsyncFd registration failed (extremely unusual); + // fall back to immediate cleanup so we don't leak the + // index entry. The OwnedFd we passed in is consumed + // by `with_interest`'s Err return and will close on + // drop here. + cleanup_pid(&ctx, key).await; + return; + } + }; + // pidfd becomes readable when the process exits; we don't + // read any data, so `readable()` is just an await point. + let _ = async_fd.readable().await; + cleanup_pid(&ctx, key).await; + // async_fd drops here, closing the pidfd. + }); +} + +/// Drop the supervisor's per-process state for `key`. 
With every +/// per-process map living inside `PerProcessState` (owned by +/// `ProcessIndex`), this is a single unregister — the entry's `Arc` +/// drops here, and remaining clones held by in-flight handlers will +/// drop with their tasks, freeing `PerProcessState` automatically. +pub(crate) async fn cleanup_pid(ctx: &super::ctx::SupervisorCtx, key: super::state::PidKey) { + ctx.processes.unregister(key); } // ============================================================ @@ -997,7 +1073,6 @@ mod tests { assert_eq!(rs.mem_used, 0); assert_eq!(rs.max_memory_bytes, 1024 * 1024); assert_eq!(rs.max_processes, 10); - assert!(rs.brk_bases.is_empty()); assert!(!rs.hold_forks); assert!(rs.held_notif_ids.is_empty()); } diff --git a/crates/sandlock-core/src/seccomp/state.rs b/crates/sandlock-core/src/seccomp/state.rs index b9058e2..d81f207 100644 --- a/crates/sandlock-core/src/seccomp/state.rs +++ b/crates/sandlock-core/src/seccomp/state.rs @@ -1,7 +1,11 @@ // Domain-specific state structs — each domain is locked independently so -// handlers only contend on the state they actually need. +// handlers only contend on the state they actually need. Per-process +// state is bundled into a single `PerProcessState` owned by +// `ProcessIndex`; cleanup on exit is just dropping the entry's `Arc`. use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use tokio::sync::Mutex as AsyncMutex; /// Resource-limit runtime state shared across notification handlers. pub struct ResourceState { @@ -13,8 +17,6 @@ pub struct ResourceState { pub mem_used: u64, /// Maximum allowed anonymous memory (bytes). pub max_memory_bytes: u64, - /// Per-PID brk base addresses for memory tracking. - pub brk_bases: HashMap, /// Whether fork notifications should be held (checkpoint/freeze). pub hold_forks: bool, /// Notification IDs held during a checkpoint freeze. 
@@ -33,7 +35,6 @@ impl ResourceState { max_processes, mem_used: 0, max_memory_bytes, - brk_bases: HashMap::new(), hold_forks: false, held_notif_ids: Vec::new(), load_avg: crate::procfs::LoadAvg::new(), @@ -46,13 +47,11 @@ impl ResourceState { // ProcfsState — /proc virtualization state // ============================================================ -/// /proc virtualization runtime state. +/// /proc virtualization runtime state. Sandbox membership lives in +/// `ProcessIndex`; per-process getdents caches live in +/// `PerProcessState::procfs_dir_cache`. This struct only holds +/// truly global virtualization state. pub struct ProcfsState { - /// PIDs belonging to the sandbox (for /proc PID filtering). - pub proc_pids: HashSet, - /// Cache of filtered dirent entries keyed by (pid, fd, directory target). - /// Populated on first getdents64 call for a /proc directory, drained on subsequent calls. - pub getdents_cache: HashMap<(i32, u32, String), Vec>>, /// Base address of the last vDSO we patched (0 = not yet patched). pub vdso_patched_addr: u64, } @@ -60,18 +59,18 @@ pub struct ProcfsState { impl ProcfsState { pub fn new() -> Self { Self { - proc_pids: HashSet::new(), - getdents_cache: HashMap::new(), vdso_patched_addr: 0, } } } // ============================================================ -// CowState — copy-on-write filesystem state +// PidKey — stable per-process identity // ============================================================ -/// Stable process identity for per-process COW state. +/// Stable process identity. Numeric pid plus the start_time that +/// distinguishes a specific process instance from any future recycle +/// of the same pid slot. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub struct PidKey { /// Numeric PID observed by seccomp notification. @@ -80,57 +79,209 @@ pub struct PidKey { pub start_time: u64, } -/// Copy-on-write filesystem state. -pub struct CowState { - /// Seccomp-based COW branch (None if COW disabled). 
- pub branch: Option, - /// Getdents cache for COW directories. - /// Value is (host_path, entries) to detect fd reuse and invalidate stale entries. - pub dir_cache: HashMap<(PidKey, u32), (String, Vec>)>, - /// Logical cwd for processes that chdir into COW-only directories. - pub virtual_cwds: HashMap, +/// Read the process start time (field 22 of /proc//stat) for `pid`. +/// Returns None if the process is gone or /proc is not readable. +pub(crate) fn read_pid_start_time(pid: i32) -> Option { + let stat = std::fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?; + // Skip past "pid (comm)" — comm may contain spaces and parens, but the + // last ") " in the line ends the comm field. + let rest = stat.rsplit_once(") ")?.1; + // The first token after "(comm) " is field 3; field 22 is therefore nth(19). + rest.split_whitespace().nth(19)?.parse().ok() } -impl CowState { +// ============================================================ +// PerProcessState — bundled per-process supervisor state +// ============================================================ + +/// All per-process supervisor state for one tracked child. One +/// instance lives per `PidKey`, owned by `ProcessIndex` behind an +/// `Arc>`. Cleanup on process exit is one operation: +/// `ProcessIndex::unregister` drops the index's `Arc`, and the +/// supervisor's per-handler clones drop along with their tasks. +#[derive(Default)] +pub struct PerProcessState { + /// Logical cwd while the process is chdir'd into a COW-only + /// directory. None means "use kernel-reported cwd". + pub virtual_cwd: Option, + /// Recorded brk base for memory accounting. None until first brk. + pub brk_base: Option, + /// COW directory dirent cache. Keyed by child's fd; value is + /// (host target path, sorted dirent bytes left to return). + /// Entries are invalidated when the fd is reused for a different + /// directory. + pub cow_dir_cache: HashMap>)>, + /// /proc directory dirent cache. 
Keyed by (child fd, target + /// path); same drain-on-EOF semantics as cow_dir_cache. + pub procfs_dir_cache: HashMap<(u32, String), Vec>>, +} + +// ============================================================ +// ProcessIndex — sandbox membership + per-process state +// ============================================================ + +/// Source-of-truth registry for processes inside the sandbox. +/// +/// Maps the kernel's numeric `pid` (the value that arrives in seccomp +/// notifications) to the canonical `PidKey` plus an +/// `Arc>` holding everything per-process. +/// Held behind an internal `std::sync::RwLock` so the read-mostly hot +/// paths (`key_for`, `contains`, `entry_for`, `/proc` virtualization) +/// avoid an async mutex on every notification, and so `ProcessIndex` +/// doesn't need its own outer wrapper in `SupervisorCtx`. Lock guards +/// are `!Send` and the compiler will reject holding one across an +/// `.await`, which keeps callers honest. +/// +/// Ownership of each child's pidfd lives with the per-child watcher +/// task, not with this index. That keeps the kernel fd alive for as +/// long as the `AsyncFd` registration in the tokio IO driver does, +/// and avoids a race where dropping the fd from the index could +/// deregister a recycled fd from epoll. +pub struct ProcessIndex { + inner: std::sync::RwLock>, +} + +#[derive(Clone)] +struct ProcessEntry { + key: PidKey, + state: Arc>, +} + +impl ProcessIndex { pub fn new() -> Self { Self { - branch: None, - dir_cache: HashMap::new(), - virtual_cwds: HashMap::new(), + inner: std::sync::RwLock::new(HashMap::new()), + } + } + + /// Register a process by reading its start_time once and + /// allocating its `PerProcessState`. Returns the canonical key, + /// or None if the process is already gone. The caller is + /// responsible for keeping the pidfd alive — the per-child + /// watcher task does this via `AsyncFd`. 
+ pub fn register(&self, pid: i32) -> Option<PidKey> { + let start_time = read_pid_start_time(pid)?; + let key = PidKey { pid, start_time }; + let entry = ProcessEntry { + key, + state: Arc::new(AsyncMutex::new(PerProcessState::default())), + }; + self.inner.write().ok()?.insert(pid, entry); + Some(key) + } + + /// Look up the canonical PidKey for a notification's raw pid. + /// Returns None if this pid was never registered (e.g. pidfd_open + /// failed at fork) — callers should fall back to a no-op. + pub fn key_for(&self, pid: i32) -> Option<PidKey> { + self.inner.read().ok()?.get(&pid).map(|e| e.key) + } + + /// Look up both the PidKey and the per-process state handle for + /// `pid`. Returns None if the pid isn't tracked. The caller locks + /// the returned `Arc<AsyncMutex<PerProcessState>>` to read or mutate. + pub fn entry_for(&self, pid: i32) -> Option<(PidKey, Arc<AsyncMutex<PerProcessState>>)> { + self.inner + .read() + .ok()? + .get(&pid) + .map(|e| (e.key, Arc::clone(&e.state))) + } + + /// Cheap membership test — used by /proc virtualization to gate + /// access to `/proc/<pid>/...` paths and by getdents filtering. + pub fn contains(&self, pid: i32) -> bool { + self.inner + .read() + .map(|g| g.contains_key(&pid)) + .unwrap_or(false) + } + + /// Number of tracked processes (for /proc/loadavg total). + pub fn len(&self) -> usize { + self.inner.read().map(|g| g.len()).unwrap_or(0) + } + + /// Largest tracked pid (for /proc/loadavg last_pid). + pub fn max_pid(&self) -> Option<i32> { + self.inner.read().ok()?.keys().copied().max() + } + + /// Snapshot the set of tracked pids. Used by getdents filtering + /// where the caller needs O(1) lookups inside a loop and would + /// otherwise have to re-acquire the read lock per entry. + pub fn pids_snapshot(&self) -> HashSet<i32> { + self.inner + .read() + .map(|g| g.keys().copied().collect()) + .unwrap_or_default() + } + + /// Remove a process from the index. The per-process state's + /// `Arc` reference held by the index drops here; remaining clones + /// (e.g.
a handler that's mid-execution for that pid) will drop + /// when they go out of scope, and the inner `PerProcessState` + /// frees automatically. + pub fn unregister(&self, key: PidKey) { + if let Ok(mut g) = self.inner.write() { + // Only clear if the entry still points at this key. A PID + // recycled with a fresh start_time may already have + // overwritten the entry via register(); we must not stomp it. + if g.get(&key.pid).map(|e| e.key) == Some(key) { + g.remove(&key.pid); + } } } - /// Drop COW per-process entries for an older process that used the same numeric PID. - pub(crate) fn prune_reused_pid(&mut self, current: PidKey) { - self.virtual_cwds - .retain(|key, _| key.pid != current.pid || *key == current); - self.dir_cache - .retain(|(key, _), _| key.pid != current.pid || *key == current); + /// Defensive sweep: drop entries whose process is gone (or whose + /// start_time has changed). Called from a low-frequency backstop + /// task in case a pidfd watcher failed to spawn or the kernel + /// didn't deliver the readability event. 
+ pub fn prune_dead(&self) { + let candidates: Vec<(i32, PidKey)> = match self.inner.read() { + Ok(g) => g.iter().map(|(p, e)| (*p, e.key)).collect(), + Err(_) => return, + }; + let mut dead = Vec::new(); + for (pid, key) in candidates { + match read_pid_start_time(pid) { + Some(st) if st == key.start_time => continue, + _ => dead.push(key), + } + } + if dead.is_empty() { + return; + } + if let Ok(mut g) = self.inner.write() { + for key in dead { + if g.get(&key.pid).map(|e| e.key) == Some(key) { + g.remove(&key.pid); + } + } + } } } -#[cfg(test)] -mod tests { - use super::*; +impl Default for ProcessIndex { + fn default() -> Self { + Self::new() + } +} - #[test] - fn cow_state_prunes_entries_for_reused_pid() { - let old = PidKey { pid: 42, start_time: 1 }; - let current = PidKey { pid: 42, start_time: 2 }; - let other = PidKey { pid: 43, start_time: 1 }; - let mut state = CowState::new(); - - state.virtual_cwds.insert(old, "/old".to_string()); - state.virtual_cwds.insert(other, "/other".to_string()); - state.dir_cache.insert((old, 7), ("/old".to_string(), Vec::new())); - state.dir_cache.insert((other, 7), ("/other".to_string(), Vec::new())); - - state.prune_reused_pid(current); - - assert!(!state.virtual_cwds.contains_key(&old)); - assert!(!state.dir_cache.contains_key(&(old, 7))); - assert_eq!(state.virtual_cwds.get(&other), Some(&"/other".to_string())); - assert!(state.dir_cache.contains_key(&(other, 7))); +// ============================================================ +// CowState — copy-on-write filesystem state (global only) +// ============================================================ + +/// Global COW state. Per-process COW state (virtual cwd, dir cache) +/// lives in `PerProcessState`. +pub struct CowState { + /// Seccomp-based COW branch (None if COW disabled). 
+ pub branch: Option, +} + +impl CowState { + pub fn new() -> Self { + Self { branch: None } } } @@ -176,13 +327,11 @@ impl NetworkState { pid: u32, live_policy: Option<&std::sync::Arc>>, ) -> crate::seccomp::notif::NetworkPolicy { - // Per-PID override takes priority if let Ok(overrides) = self.pid_ip_overrides.read() { if let Some(ips) = overrides.get(&pid) { return crate::seccomp::notif::NetworkPolicy::AllowList(ips.clone()); } } - // Live policy (dynamic updates from policy_fn) if let Some(lp) = live_policy { if let Ok(live) = lp.read() { if !live.allowed_ips.is_empty() { @@ -190,7 +339,6 @@ impl NetworkState { } } } - // Global policy self.network_policy.clone() } } @@ -263,3 +411,115 @@ impl ChrootState { Self { chroot_exe: None } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn process_index_register_lookup_unregister() { + let self_pid = unsafe { libc::getpid() }; + let idx = ProcessIndex::new(); + let key = idx + .register(self_pid) + .expect("register should succeed for live pid"); + assert_eq!(key.pid, self_pid); + + assert_eq!(idx.key_for(self_pid), Some(key)); + assert!(idx.contains(self_pid)); + assert_eq!(idx.key_for(self_pid + 999_999), None); + assert!(!idx.contains(self_pid + 999_999)); + assert_eq!(idx.len(), 1); + assert_eq!(idx.max_pid(), Some(self_pid)); + + idx.unregister(key); + assert_eq!(idx.key_for(self_pid), None); + assert!(!idx.contains(self_pid)); + assert_eq!(idx.len(), 0); + assert_eq!(idx.max_pid(), None); + } + + #[test] + fn process_index_register_overwrites_stale_entry_for_recycled_pid() { + let self_pid = unsafe { libc::getpid() }; + let idx = ProcessIndex::new(); + // Forge a stale entry by direct insertion under the lock. 
+ { + let stale_key = PidKey { pid: self_pid, start_time: 0 }; + let stale = ProcessEntry { + key: stale_key, + state: Arc::new(AsyncMutex::new(PerProcessState::default())), + }; + idx.inner.write().unwrap().insert(self_pid, stale); + } + + let new_key = idx.register(self_pid).unwrap(); + assert_ne!(new_key.start_time, 0); + assert_eq!(idx.key_for(self_pid), Some(new_key)); + + // Unregistering by the stale key must NOT clobber the fresh + // registration; only an exact-match unregister wins. + let stale_key = PidKey { pid: self_pid, start_time: 0 }; + idx.unregister(stale_key); + assert_eq!(idx.key_for(self_pid), Some(new_key)); + } + + #[tokio::test] + async fn process_index_entry_for_returns_shared_handle() { + let self_pid = unsafe { libc::getpid() }; + let idx = ProcessIndex::new(); + let key = idx.register(self_pid).unwrap(); + + let (k1, s1) = idx.entry_for(self_pid).unwrap(); + let (k2, s2) = idx.entry_for(self_pid).unwrap(); + assert_eq!(k1, key); + assert_eq!(k2, key); + + // Two clones of the same Arc — writes through one are visible + // through the other. + s1.lock().await.brk_base = Some(0xdead_beef); + assert_eq!(s2.lock().await.brk_base, Some(0xdead_beef)); + + // After unregister, entry_for returns None but existing Arc + // clones stay valid (kept alive by callers). + idx.unregister(key); + assert!(idx.entry_for(self_pid).is_none()); + assert_eq!(s1.lock().await.brk_base, Some(0xdead_beef)); + } + + #[test] + fn process_index_pids_snapshot_is_independent() { + let self_pid = unsafe { libc::getpid() }; + let idx = ProcessIndex::new(); + let key = idx.register(self_pid).unwrap(); + let snap = idx.pids_snapshot(); + idx.unregister(key); + assert!(snap.contains(&self_pid)); + assert!(!idx.contains(self_pid)); + } + + #[test] + fn process_index_prune_dead_drops_recycled_entries() { + let self_pid = unsafe { libc::getpid() }; + let idx = ProcessIndex::new(); + // Insert a stale entry for self with a wrong start_time. 
+ let stale_key = PidKey { pid: self_pid, start_time: 0 }; + let stale = ProcessEntry { + key: stale_key, + state: Arc::new(AsyncMutex::new(PerProcessState::default())), + }; + idx.inner.write().unwrap().insert(self_pid, stale); + + idx.prune_dead(); + assert!(!idx.contains(self_pid)); + } + + #[test] + fn process_index_prune_dead_keeps_live_entries() { + let self_pid = unsafe { libc::getpid() }; + let idx = ProcessIndex::new(); + let key = idx.register(self_pid).unwrap(); + idx.prune_dead(); + assert_eq!(idx.key_for(self_pid), Some(key)); + } +}