Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion crates/sandlock-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,8 @@ fn no_supervisor_exec(policy: &Policy, cmd: &[&str]) -> Result<()> {

// 2. Install deny-only seccomp filter (blocks dangerous syscalls without supervisor)
let deny_nrs = sandlock_core::context::no_supervisor_deny_syscall_numbers();
let filter = sandlock_core::seccomp::bpf::assemble_filter(&[], &deny_nrs, &[]);
let filter = sandlock_core::seccomp::bpf::assemble_filter(&[], &deny_nrs, &[])
.map_err(|e| anyhow!("seccomp assemble failed: {}", e))?;
sandlock_core::seccomp::bpf::install_deny_filter(&filter)
.map_err(|e| anyhow!("seccomp deny filter failed: {}", e))?;

Expand Down
10 changes: 8 additions & 2 deletions crates/sandlock-core/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,10 @@ pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair,
if nested {
// Nested sandbox: deny-only filter (no supervisor — parent handles it).
// BPF filters are ANDed by the kernel, so each level can only tighten.
let filter = bpf::assemble_filter(&[], &deny, &args);
let filter = match bpf::assemble_filter(&[], &deny, &args) {
Ok(f) => f,
Err(e) => fail!(format!("seccomp assemble: {}", e)),
};
if let Err(e) = bpf::install_deny_filter(&filter) {
fail!(format!("seccomp deny filter: {}", e));
}
Expand All @@ -864,7 +867,10 @@ pub(crate) fn confine_child(policy: &Policy, cmd: &[CString], pipes: &PipePair,
} else {
// First-level sandbox: notif + deny filter with NEW_LISTENER.
let notif = notif_syscalls(policy);
let filter = bpf::assemble_filter(&notif, &deny, &args);
let filter = match bpf::assemble_filter(&notif, &deny, &args) {
Ok(f) => f,
Err(e) => fail!(format!("seccomp assemble: {}", e)),
};
let notif_fd = match bpf::install_filter(&filter) {
Ok(fd) => fd,
Err(e) => fail!(format!("seccomp install: {}", e)),
Expand Down
4 changes: 3 additions & 1 deletion crates/sandlock-core/src/cow/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -954,7 +954,9 @@ pub(crate) async fn handle_cow_getdents(
let d_ino = std::fs::symlink_metadata(check)
.map(|m| m.ino())
.unwrap_or(0);
out.push(build_dirent64(d_ino, d_off, d_type, name));
if let Some(rec) = build_dirent64(d_ino, d_off, d_type, name) {
out.push(rec);
}
}
out
};
Expand Down
41 changes: 34 additions & 7 deletions crates/sandlock-core/src/procfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,10 @@ fn parse_proc_net_tcp_port(line: &str) -> Option<u16> {
/// robust design would store it in an arena, but leaking is acceptable for
/// the supervisor's lifetime.
fn inject_memfd(content: &[u8]) -> NotifAction {
let memfd = match syscall::memfd_create("sandlock", 0) {
let memfd = match syscall::memfd_create(
"sandlock",
(libc::MFD_CLOEXEC | libc::MFD_ALLOW_SEALING) as u32,
) {
Ok(fd) => fd,
Err(_) => return NotifAction::Continue, // fallback: let real open proceed
};
Expand All @@ -356,6 +359,14 @@ fn inject_memfd(content: &[u8]) -> NotifAction {
std::mem::forget(file);
}

// Lock the memfd: the child gets an injected fd to the same description
// (memfd_create returns RW), so without sealing they could overwrite the
// synthesised /proc content. Best-effort — if F_ADD_SEALS fails (very old
// kernel without sealing support), we still inject the fd, since the child
// is still bounded by everything else in the policy.
let seals = libc::F_SEAL_SEAL | libc::F_SEAL_WRITE | libc::F_SEAL_GROW | libc::F_SEAL_SHRINK;
unsafe { libc::fcntl(raw, libc::F_ADD_SEALS, seals) };

// Move the OwnedFd into InjectFdSend — send_response will close it after the ioctl.
NotifAction::InjectFdSend { srcfd: memfd, newfd_flags: libc::O_CLOEXEC as u32 }
}
Expand Down Expand Up @@ -673,7 +684,7 @@ pub(crate) async fn handle_sorted_getdents(
let entries: Vec<Vec<u8>> = names
.iter()
.enumerate()
.map(|(i, (name, d_type, d_ino))| {
.filter_map(|(i, (name, d_type, d_ino))| {
build_dirent64(*d_ino, (i + 1) as i64, *d_type, name)
})
.collect();
Expand Down Expand Up @@ -729,16 +740,24 @@ pub(crate) const DT_LNK: u8 = 10;
/// Build a single linux_dirent64 entry.
/// struct linux_dirent64 { u64 d_ino; s64 d_off; u16 d_reclen; u8 d_type; char d_name[]; }
/// d_reclen is 8-byte aligned.
pub(crate) fn build_dirent64(d_ino: u64, d_off: i64, d_type: u8, name: &str) -> Vec<u8> {
///
/// Returns `None` if `name` exceeds the Linux NAME_MAX limit (255 bytes) —
/// such names can't appear in a real dirent stream, and accepting them would
/// produce a record whose `d_reclen` overflows the u16 field.
pub(crate) fn build_dirent64(d_ino: u64, d_off: i64, d_type: u8, name: &str) -> Option<Vec<u8>> {
const NAME_MAX: usize = 255;
let name_bytes = name.as_bytes();
if name_bytes.len() > NAME_MAX {
return None;
}
let reclen = ((19 + name_bytes.len() + 1) + 7) & !7; // +1 NUL, align to 8
let mut buf = vec![0u8; reclen];
buf[0..8].copy_from_slice(&d_ino.to_ne_bytes());
buf[8..16].copy_from_slice(&d_off.to_ne_bytes());
buf[16..18].copy_from_slice(&(reclen as u16).to_ne_bytes());
buf[18] = d_type;
buf[19..19 + name_bytes.len()].copy_from_slice(name_bytes);
buf
Some(buf)
}

/// Build a filtered list of dirent64 entries for /proc, hiding PIDs not in the sandbox.
Expand Down Expand Up @@ -779,7 +798,9 @@ fn build_filtered_dirents(sandbox_pids: &HashSet<i32>) -> Vec<Vec<u8>> {
entry.metadata().map(|m| m.st_ino()).unwrap_or(0)
};

entries.push(build_dirent64(d_ino, d_off, d_type, &name_str));
if let Some(rec) = build_dirent64(d_ino, d_off, d_type, &name_str) {
entries.push(rec);
}
}
entries
}
Expand Down Expand Up @@ -1111,7 +1132,7 @@ mod tests {

#[test]
fn test_build_dirent64() {
let entry = build_dirent64(12345, 1, DT_DIR, "1234");
let entry = build_dirent64(12345, 1, DT_DIR, "1234").unwrap();
assert_eq!(entry.len(), 24); // 19 + 5 = 24, already aligned
let d_ino = u64::from_ne_bytes(entry[0..8].try_into().unwrap());
assert_eq!(d_ino, 12345);
Expand All @@ -1124,11 +1145,17 @@ mod tests {

#[test]
fn test_build_dirent64_alignment() {
let entry = build_dirent64(1, 1, DT_REG, "ab");
let entry = build_dirent64(1, 1, DT_REG, "ab").unwrap();
// 19 + 3 = 22, padded to 24
assert_eq!(entry.len(), 24);
}

#[test]
fn test_build_dirent64_rejects_oversize_name() {
let name = "x".repeat(256);
assert!(build_dirent64(1, 1, DT_REG, &name).is_none());
}

#[test]
fn test_build_filtered_dirents() {
use std::collections::HashSet;
Expand Down
10 changes: 9 additions & 1 deletion crates/sandlock-core/src/random.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ pub(crate) fn handle_random_open(
}

// Create a memfd filled with deterministic PRNG bytes.
let memfd = match syscall::memfd_create("sandlock-random", 0) {
let memfd = match syscall::memfd_create(
"sandlock-random",
(libc::MFD_CLOEXEC | libc::MFD_ALLOW_SEALING) as u32,
) {
Ok(fd) => fd,
Err(_) => return Some(NotifAction::Continue),
};
Expand All @@ -80,6 +83,11 @@ pub(crate) fn handle_random_open(
std::mem::forget(file);
}

// Seal the memfd — the child gets a fd to the same RW description and
// could otherwise overwrite the deterministic stream. Best-effort.
let seals = libc::F_SEAL_SEAL | libc::F_SEAL_WRITE | libc::F_SEAL_GROW | libc::F_SEAL_SHRINK;
unsafe { libc::fcntl(raw, libc::F_ADD_SEALS, seals) };

// Move the OwnedFd into InjectFdSend — send_response will close it after the ioctl.
Some(NotifAction::InjectFdSend { srcfd: memfd, newfd_flags: libc::O_CLOEXEC as u32 })
}
Expand Down
5 changes: 4 additions & 1 deletion crates/sandlock-core/src/sandbox.rs
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,10 @@ impl Sandbox {

let deny = crate::context::deny_syscall_numbers(&policy);
let args = crate::context::arg_filters(&policy);
let filter = crate::seccomp::bpf::assemble_filter(&[], &deny, &args);
let filter = match crate::seccomp::bpf::assemble_filter(&[], &deny, &args) {
Ok(f) => f,
Err(_) => unsafe { libc::_exit(1) },
};
let _ = crate::seccomp::bpf::install_deny_filter(&filter);

CONFINED.store(true, std::sync::atomic::Ordering::Relaxed);
Expand Down
46 changes: 37 additions & 9 deletions crates/sandlock-core/src/seccomp/bpf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,18 @@ pub(crate) fn jump(code: u16, k: u32, jt: u8, jf: u8) -> SockFilter {
/// * `notif_syscalls` — syscalls that generate SECCOMP_RET_USER_NOTIF
/// * `deny_syscalls` — syscalls that return ERRNO(EPERM)
/// * `arg_block` — pre-built arg filter instructions (from `context::arg_filters`)
///
/// Returns an error if the resulting program would exceed the kernel's
/// `BPF_MAXINSNS` (4096) instruction limit. Catching this here gives a
/// clearer error than the kernel's `EINVAL` from `seccomp(2)`, and also
/// guards the `(idx - n) as u8` jump-offset arithmetic below — cBPF jump
/// offsets are u8, so a program over 256 instructions plus careless
/// changes could silently truncate offsets.
pub fn assemble_filter(
notif_syscalls: &[u32],
deny_syscalls: &[u32],
arg_block: &[SockFilter],
) -> Vec<SockFilter> {
) -> Result<Vec<SockFilter>, std::io::Error> {
// ---- compute final layout sizes ----
let arch_block = 2usize; // LD arch, JEQ arch (KILL is in ret section)
let arg_block_len = arg_block.len();
Expand All @@ -62,6 +69,15 @@ pub fn assemble_filter(

let total = arch_block + arg_block_len + load_nr + notif_jmps + deny_jmps + ret_section;

// Linux kernel cBPF program length limit (BPF_MAXINSNS).
const MAX_BPF_INSNS: usize = 4096;
if total > MAX_BPF_INSNS {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
format!("BPF program too large: {} instructions (max {})", total, MAX_BPF_INSNS),
));
}

// Indices of the four return instructions (absolute, 0-based).
let ret_kill_idx = total - 1;

Expand Down Expand Up @@ -103,7 +119,7 @@ pub fn assemble_filter(
prog.push(stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)); // ret_kill_idx

debug_assert_eq!(prog.len(), total, "BPF program length mismatch");
prog
Ok(prog)
}

// ============================================================
Expand Down Expand Up @@ -155,7 +171,7 @@ mod tests {

#[test]
fn test_empty_filter_has_arch_check_and_allow() {
let prog = assemble_filter(&[], &[], &[]);
let prog = assemble_filter(&[], &[], &[]).unwrap();
assert!(prog.len() >= 5);
// First instruction loads arch
assert_eq!(prog[0].code, BPF_LD | BPF_W | BPF_ABS);
Expand All @@ -164,7 +180,7 @@ mod tests {

#[test]
fn test_deny_syscall_present() {
let prog = assemble_filter(&[], &[libc::SYS_mount as u32], &[]);
let prog = assemble_filter(&[], &[libc::SYS_mount as u32], &[]).unwrap();
let has_mount = prog
.iter()
.any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K) && f.k == libc::SYS_mount as u32);
Expand All @@ -173,7 +189,7 @@ mod tests {

#[test]
fn test_notif_syscall_present() {
let prog = assemble_filter(&[libc::SYS_openat as u32], &[], &[]);
let prog = assemble_filter(&[libc::SYS_openat as u32], &[], &[]).unwrap();
let has_openat = prog
.iter()
.any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K) && f.k == libc::SYS_openat as u32);
Expand All @@ -182,7 +198,7 @@ mod tests {

#[test]
fn test_arch_jf_lands_on_kill() {
let prog = assemble_filter(&[], &[], &[]);
let prog = assemble_filter(&[], &[], &[]).unwrap();
// prog[1] is the JEQ arch check; jf should reach the KILL return.
let arch_jeq = &prog[1];
assert_eq!(arch_jeq.code, BPF_JMP | BPF_JEQ | BPF_K);
Expand All @@ -197,7 +213,7 @@ mod tests {

#[test]
fn test_default_allow_is_before_returns() {
let prog = assemble_filter(&[libc::SYS_openat as u32], &[libc::SYS_mount as u32], &[]);
let prog = assemble_filter(&[libc::SYS_openat as u32], &[libc::SYS_mount as u32], &[]).unwrap();
// RET section is last 4 instructions; first of them is ALLOW.
let allow_instr = &prog[prog.len() - 4];
assert_eq!(allow_instr.code, BPF_RET | BPF_K);
Expand All @@ -206,7 +222,7 @@ mod tests {

#[test]
fn test_notif_jt_lands_on_user_notif() {
let prog = assemble_filter(&[libc::SYS_openat as u32], &[], &[]);
let prog = assemble_filter(&[libc::SYS_openat as u32], &[], &[]).unwrap();
// USER_NOTIF return is at prog.len()-3.
let ret_notif_idx = prog.len() - 3;
// arch_block=2, arg_blocks=0, LD NR at index 2, notif JEQ at index 3.
Expand All @@ -229,7 +245,7 @@ mod tests {
jump(BPF_JMP | BPF_JSET | BPF_K, 0x0200_0000, 0, 1),
stmt(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM as u32),
];
let prog = assemble_filter(&[], &[], &arg_block);
let prog = assemble_filter(&[], &[], &arg_block).unwrap();
// Arch block = 2, arg block starts at index 2.
// [2] LD NR
assert_eq!(prog[2].code, BPF_LD | BPF_W | BPF_ABS);
Expand All @@ -244,4 +260,16 @@ mod tests {
assert_eq!(prog[5].code, BPF_JMP | BPF_JSET | BPF_K);
assert_eq!(prog[5].k, 0x0200_0000);
}

#[test]
fn test_oversized_filter_is_rejected() {
// 4097 distinct deny entries + the 7-instruction frame > 4096.
let deny: Vec<u32> = (0..4097u32).collect();
let res = assemble_filter(&[], &deny, &[]);
let err = match res {
Ok(_) => panic!("expected oversize error"),
Err(e) => e,
};
assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput);
}
}
Loading