diff --git a/crates/sandlock-cli/src/main.rs b/crates/sandlock-cli/src/main.rs index a6e4750..f31937b 100644 --- a/crates/sandlock-cli/src/main.rs +++ b/crates/sandlock-cli/src/main.rs @@ -671,7 +671,8 @@ fn no_supervisor_exec(policy: &Policy, cmd: &[&str]) -> Result<()> { // 2. Install deny-only seccomp filter (blocks dangerous syscalls without supervisor) let deny_nrs = sandlock_core::context::no_supervisor_deny_syscall_numbers(); - let filter = sandlock_core::seccomp::bpf::assemble_filter(&[], &deny_nrs, &[]); + let filter = sandlock_core::seccomp::bpf::assemble_filter(&[], &deny_nrs, &[]) + .map_err(|e| anyhow!("seccomp assemble failed: {}", e))?; sandlock_core::seccomp::bpf::install_deny_filter(&filter) .map_err(|e| anyhow!("seccomp deny filter failed: {}", e))?; diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index 8eeec19..389ab28 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -853,7 +853,10 @@ pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, if nested { // Nested sandbox: deny-only filter (no supervisor — parent handles it). // BPF filters are ANDed by the kernel, so each level can only tighten. - let filter = bpf::assemble_filter(&[], &deny, &args); + let filter = match bpf::assemble_filter(&[], &deny, &args) { + Ok(f) => f, + Err(e) => fail!(format!("seccomp assemble: {}", e)), + }; if let Err(e) = bpf::install_deny_filter(&filter) { fail!(format!("seccomp deny filter: {}", e)); } @@ -864,7 +867,10 @@ pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair, } else { // First-level sandbox: notif + deny filter with NEW_LISTENER. 
let notif = notif_syscalls(policy); - let filter = bpf::assemble_filter(&notif, &deny, &args); + let filter = match bpf::assemble_filter(&notif, &deny, &args) { + Ok(f) => f, + Err(e) => fail!(format!("seccomp assemble: {}", e)), + }; let notif_fd = match bpf::install_filter(&filter) { Ok(fd) => fd, Err(e) => fail!(format!("seccomp install: {}", e)), diff --git a/crates/sandlock-core/src/cow/dispatch.rs b/crates/sandlock-core/src/cow/dispatch.rs index 98ed9f0..a2fb602 100644 --- a/crates/sandlock-core/src/cow/dispatch.rs +++ b/crates/sandlock-core/src/cow/dispatch.rs @@ -954,7 +954,9 @@ pub(crate) async fn handle_cow_getdents( let d_ino = std::fs::symlink_metadata(check) .map(|m| m.ino()) .unwrap_or(0); - out.push(build_dirent64(d_ino, d_off, d_type, name)); + if let Some(rec) = build_dirent64(d_ino, d_off, d_type, name) { + out.push(rec); + } } out }; diff --git a/crates/sandlock-core/src/procfs.rs b/crates/sandlock-core/src/procfs.rs index f49725a..1320f35 100644 --- a/crates/sandlock-core/src/procfs.rs +++ b/crates/sandlock-core/src/procfs.rs @@ -338,7 +338,10 @@ fn parse_proc_net_tcp_port(line: &str) -> Option<u16> { /// robust design would store it in an arena, but leaking is acceptable for /// the supervisor's lifetime. fn inject_memfd(content: &[u8]) -> NotifAction { - let memfd = match syscall::memfd_create("sandlock", 0) { + let memfd = match syscall::memfd_create( + "sandlock", + (libc::MFD_CLOEXEC | libc::MFD_ALLOW_SEALING) as u32, + ) { Ok(fd) => fd, Err(_) => return NotifAction::Continue, // fallback: let real open proceed }; @@ -356,6 +359,14 @@ fn inject_memfd(content: &[u8]) -> NotifAction { std::mem::forget(file); } + // Lock the memfd: the child gets an injected fd to the same description + // (memfd_create returns RW), so without sealing they could overwrite the + // synthesised /proc content. 
Best-effort — if F_ADD_SEALS fails (very old + // kernel without sealing support), we still inject the fd, since the child + // is still bounded by everything else in the policy. + let seals = libc::F_SEAL_SEAL | libc::F_SEAL_WRITE | libc::F_SEAL_GROW | libc::F_SEAL_SHRINK; + unsafe { libc::fcntl(raw, libc::F_ADD_SEALS, seals) }; + // Move the OwnedFd into InjectFdSend — send_response will close it after the ioctl. NotifAction::InjectFdSend { srcfd: memfd, newfd_flags: libc::O_CLOEXEC as u32 } } @@ -673,7 +684,7 @@ pub(crate) async fn handle_sorted_getdents( let entries: Vec<Vec<u8>> = names .iter() .enumerate() - .map(|(i, (name, d_type, d_ino))| { + .filter_map(|(i, (name, d_type, d_ino))| { build_dirent64(*d_ino, (i + 1) as i64, *d_type, name) }) .collect(); @@ -729,8 +740,16 @@ pub(crate) const DT_LNK: u8 = 10; /// Build a single linux_dirent64 entry. /// struct linux_dirent64 { u64 d_ino; s64 d_off; u16 d_reclen; u8 d_type; char d_name[]; } /// d_reclen is 8-byte aligned. -pub(crate) fn build_dirent64(d_ino: u64, d_off: i64, d_type: u8, name: &str) -> Vec<u8> { +/// +/// Returns `None` if `name` exceeds the Linux NAME_MAX limit (255 bytes) — +/// such names can't appear in a real dirent stream, and accepting them would +/// produce a record whose `d_reclen` overflows the u16 field. 
+pub(crate) fn build_dirent64(d_ino: u64, d_off: i64, d_type: u8, name: &str) -> Option<Vec<u8>> { + const NAME_MAX: usize = 255; let name_bytes = name.as_bytes(); + if name_bytes.len() > NAME_MAX { + return None; + } let reclen = ((19 + name_bytes.len() + 1) + 7) & !7; // +1 NUL, align to 8 let mut buf = vec![0u8; reclen]; buf[0..8].copy_from_slice(&d_ino.to_ne_bytes()); @@ -738,7 +757,7 @@ pub(crate) fn build_dirent64(d_ino: u64, d_off: i64, d_type: u8, name: &str) -> buf[16..18].copy_from_slice(&(reclen as u16).to_ne_bytes()); buf[18] = d_type; buf[19..19 + name_bytes.len()].copy_from_slice(name_bytes); - buf + Some(buf) } /// Build a filtered list of dirent64 entries for /proc, hiding PIDs not in the sandbox. @@ -779,7 +798,9 @@ fn build_filtered_dirents(sandbox_pids: &HashSet<u32>) -> Vec<Vec<u8>> { entry.metadata().map(|m| m.st_ino()).unwrap_or(0) }; - entries.push(build_dirent64(d_ino, d_off, d_type, &name_str)); + if let Some(rec) = build_dirent64(d_ino, d_off, d_type, &name_str) { + entries.push(rec); + } } entries } @@ -1111,7 +1132,7 @@ mod tests { #[test] fn test_build_dirent64() { - let entry = build_dirent64(12345, 1, DT_DIR, "1234"); + let entry = build_dirent64(12345, 1, DT_DIR, "1234").unwrap(); assert_eq!(entry.len(), 24); // 19 + 5 = 24, already aligned let d_ino = u64::from_ne_bytes(entry[0..8].try_into().unwrap()); assert_eq!(d_ino, 12345); @@ -1124,11 +1145,17 @@ mod tests { #[test] fn test_build_dirent64_alignment() { - let entry = build_dirent64(1, 1, DT_REG, "ab"); + let entry = build_dirent64(1, 1, DT_REG, "ab").unwrap(); // 19 + 3 = 22, padded to 24 assert_eq!(entry.len(), 24); } + #[test] + fn test_build_dirent64_rejects_oversize_name() { + let name = "x".repeat(256); + assert!(build_dirent64(1, 1, DT_REG, &name).is_none()); + } + #[test] fn test_build_filtered_dirents() { use std::collections::HashSet; diff --git a/crates/sandlock-core/src/random.rs b/crates/sandlock-core/src/random.rs index e23c794..5861a41 100644 --- a/crates/sandlock-core/src/random.rs 
+++ b/crates/sandlock-core/src/random.rs @@ -63,7 +63,10 @@ pub(crate) fn handle_random_open( } // Create a memfd filled with deterministic PRNG bytes. - let memfd = match syscall::memfd_create("sandlock-random", 0) { + let memfd = match syscall::memfd_create( + "sandlock-random", + (libc::MFD_CLOEXEC | libc::MFD_ALLOW_SEALING) as u32, + ) { Ok(fd) => fd, Err(_) => return Some(NotifAction::Continue), }; @@ -80,6 +83,11 @@ pub(crate) fn handle_random_open( std::mem::forget(file); } + // Seal the memfd — the child gets a fd to the same RW description and + // could otherwise overwrite the deterministic stream. Best-effort. + let seals = libc::F_SEAL_SEAL | libc::F_SEAL_WRITE | libc::F_SEAL_GROW | libc::F_SEAL_SHRINK; + unsafe { libc::fcntl(raw, libc::F_ADD_SEALS, seals) }; + // Move the OwnedFd into InjectFdSend — send_response will close it after the ioctl. Some(NotifAction::InjectFdSend { srcfd: memfd, newfd_flags: libc::O_CLOEXEC as u32 }) } diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 190dc2f..4592fc7 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -300,7 +300,10 @@ impl Sandbox { let deny = crate::context::deny_syscall_numbers(&policy); let args = crate::context::arg_filters(&policy); - let filter = crate::seccomp::bpf::assemble_filter(&[], &deny, &args); + let filter = match crate::seccomp::bpf::assemble_filter(&[], &deny, &args) { + Ok(f) => f, + Err(_) => unsafe { libc::_exit(1) }, + }; let _ = crate::seccomp::bpf::install_deny_filter(&filter); CONFINED.store(true, std::sync::atomic::Ordering::Relaxed); diff --git a/crates/sandlock-core/src/seccomp/bpf.rs b/crates/sandlock-core/src/seccomp/bpf.rs index e7d8e9c..b52d1e5 100644 --- a/crates/sandlock-core/src/seccomp/bpf.rs +++ b/crates/sandlock-core/src/seccomp/bpf.rs @@ -47,11 +47,18 @@ pub(crate) fn jump(code: u16, k: u32, jt: u8, jf: u8) -> SockFilter { /// * `notif_syscalls` — syscalls that generate 
SECCOMP_RET_USER_NOTIF /// * `deny_syscalls` — syscalls that return ERRNO(EPERM) /// * `arg_block` — pre-built arg filter instructions (from `context::arg_filters`) +/// +/// Returns an error if the resulting program would exceed the kernel's +/// `BPF_MAXINSNS` (4096) instruction limit. Catching this here gives a +/// clearer error than the kernel's `EINVAL` from `seccomp(2)`, and also +/// guards the `(idx - n) as u8` jump-offset arithmetic below — cBPF jump +/// offsets are u8, so a program over 256 instructions plus careless +/// changes could silently truncate offsets. pub fn assemble_filter( notif_syscalls: &[u32], deny_syscalls: &[u32], arg_block: &[SockFilter], -) -> Vec<SockFilter> { +) -> Result<Vec<SockFilter>, std::io::Error> { // ---- compute final layout sizes ---- let arch_block = 2usize; // LD arch, JEQ arch (KILL is in ret section) let arg_block_len = arg_block.len(); @@ -62,6 +69,15 @@ pub fn assemble_filter( let total = arch_block + arg_block_len + load_nr + notif_jmps + deny_jmps + ret_section; + // Linux kernel cBPF program length limit (BPF_MAXINSNS). + const MAX_BPF_INSNS: usize = 4096; + if total > MAX_BPF_INSNS { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("BPF program too large: {} instructions (max {})", total, MAX_BPF_INSNS), + )); + } + // Indices of the four return instructions (absolute, 0-based). 
let ret_kill_idx = total - 1; @@ -103,7 +119,7 @@ pub fn assemble_filter( prog.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)); // ret_kill_idx debug_assert_eq!(prog.len(), total, "BPF program length mismatch"); - prog + Ok(prog) } // ============================================================ @@ -155,7 +171,7 @@ mod tests { #[test] fn test_empty_filter_has_arch_check_and_allow() { - let prog = assemble_filter(&[], &[], &[]); + let prog = assemble_filter(&[], &[], &[]).unwrap(); assert!(prog.len() >= 5); // First instruction loads arch assert_eq!(prog[0].code, BPF_LD | BPF_W | BPF_ABS); @@ -164,7 +180,7 @@ mod tests { #[test] fn test_deny_syscall_present() { - let prog = assemble_filter(&[], &[libc::SYS_mount as u32], &[]); + let prog = assemble_filter(&[], &[libc::SYS_mount as u32], &[]).unwrap(); let has_mount = prog .iter() .any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K) && f.k == libc::SYS_mount as u32); @@ -173,7 +189,7 @@ mod tests { #[test] fn test_notif_syscall_present() { - let prog = assemble_filter(&[libc::SYS_openat as u32], &[], &[]); + let prog = assemble_filter(&[libc::SYS_openat as u32], &[], &[]).unwrap(); let has_openat = prog .iter() .any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K) && f.k == libc::SYS_openat as u32); @@ -182,7 +198,7 @@ mod tests { #[test] fn test_arch_jf_lands_on_kill() { - let prog = assemble_filter(&[], &[], &[]); + let prog = assemble_filter(&[], &[], &[]).unwrap(); // prog[1] is the JEQ arch check; jf should reach the KILL return. let arch_jeq = &prog[1]; assert_eq!(arch_jeq.code, BPF_JMP | BPF_JEQ | BPF_K); @@ -197,7 +213,7 @@ mod tests { #[test] fn test_default_allow_is_before_returns() { - let prog = assemble_filter(&[libc::SYS_openat as u32], &[libc::SYS_mount as u32], &[]); + let prog = assemble_filter(&[libc::SYS_openat as u32], &[libc::SYS_mount as u32], &[]).unwrap(); // RET section is last 4 instructions; first of them is ALLOW. 
let allow_instr = &prog[prog.len() - 4]; assert_eq!(allow_instr.code, BPF_RET | BPF_K); @@ -206,7 +222,7 @@ #[test] fn test_notif_jt_lands_on_user_notif() { - let prog = assemble_filter(&[libc::SYS_openat as u32], &[], &[]); + let prog = assemble_filter(&[libc::SYS_openat as u32], &[], &[]).unwrap(); // USER_NOTIF return is at prog.len()-3. let ret_notif_idx = prog.len() - 3; // arch_block=2, arg_blocks=0, LD NR at index 2, notif JEQ at index 3. @@ -229,7 +245,7 @@ jump(BPF_JMP | BPF_JSET | BPF_K, 0x0200_0000, 0, 1), stmt(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM as u32), ]; - let prog = assemble_filter(&[], &[], &arg_block); + let prog = assemble_filter(&[], &[], &arg_block).unwrap(); // Arch block = 2, arg block starts at index 2. // [2] LD NR assert_eq!(prog[2].code, BPF_LD | BPF_W | BPF_ABS); @@ -244,4 +260,16 @@ assert_eq!(prog[5].code, BPF_JMP | BPF_JSET | BPF_K); assert_eq!(prog[5].k, 0x0200_0000); } + + #[test] + fn test_oversized_filter_is_rejected() { + // 4097 distinct deny entries + the 7-instruction frame > 4096. + let deny: Vec<u32> = (0..4097u32).collect(); + let res = assemble_filter(&[], &deny, &[]); + let err = match res { + Ok(_) => panic!("expected oversize error"), + Err(e) => e, + }; + assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput); + } }