From b23f0037b182a531635ec90ca953282a7b4d8a55 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Tue, 5 May 2026 12:16:46 +0800 Subject: [PATCH] Honor O_PATH semantics across fd consumers Linux O_PATH descriptors carry only a path reference: read, write, lseek, ftruncate, fsync/fdatasync, flock, ioctl, fchmod, fchown, getdents64, fsetxattr, and fremovexattr must all return EBADF. Our translate_open_flags maps O_PATH to a plain O_RDONLY host fd, so without explicit gating the host call would silently succeed and diverge from Linux semantics on every one of those paths. Add host_fd_ref_open_io in syscall/internal.h that snapshots the fd entry and rejects FD_PATH with -EBADF. Wire it into sys_lseek and sys_ftruncate (fs.c), sc_fsync_common and sc_flock (syscall.c), and sys_fsetxattr / sys_fremovexattr (fs-xattr.c). sys_getdents64 gains an explicit FD_PATH gate ahead of the dir==NULL check so an O_PATH directory reports EBADF rather than ENOTDIR. io.c's host_fd_ref_open_regular_io now delegates to the shared helper. fchdir, fstat, fstatfs, close, dup, fcntl(CLOEXEC/DUPFD/GETFL), and *at() dirfd usage stay allowed, matching Linux. The audit also surfaced an aarch64 syscall-number bug. SYS_fgetxattr was 16 (real value 10) and SYS_fremovexattr was 18 (real value 16) per include/uapi/asm-generic/unistd.h. Guest fremovexattr was being dispatched to the fgetxattr handler, which then ran with garbage size/value pointers from the unused argument registers; the missing FD_PATH check kept this hidden until tests/test-opath.c exercised it. Numbers in src/syscall/abi.h now match upstream. 
--- src/syscall/abi.h | 14 +- src/syscall/fs-xattr.c | 18 ++- src/syscall/fs.c | 31 +++-- src/syscall/internal.h | 24 ++++ src/syscall/io.c | 21 +-- src/syscall/syscall.c | 10 +- tests/manifest.txt | 3 + tests/test-opath.c | 292 +++++++++++++++++++++++++++++++++++++++++ 8 files changed, 378 insertions(+), 35 deletions(-) create mode 100644 tests/test-opath.c diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 6315039..c6bfb10 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -161,19 +161,19 @@ #define SYS_copy_file_range 285 #define SYS_statx 291 #define SYS_rseq 293 -/* xattr syscalls */ -#define SYS_lgetxattr 9 +/* xattr syscalls (numbers match aarch64 asm-generic/unistd.h) */ +#define SYS_setxattr 5 #define SYS_lsetxattr 6 +#define SYS_fsetxattr 7 #define SYS_getxattr 8 -#define SYS_setxattr 5 +#define SYS_lgetxattr 9 +#define SYS_fgetxattr 10 #define SYS_listxattr 11 #define SYS_llistxattr 12 +#define SYS_flistxattr 13 #define SYS_removexattr 14 #define SYS_lremovexattr 15 -#define SYS_fgetxattr 16 -#define SYS_fsetxattr 7 -#define SYS_flistxattr 13 -#define SYS_fremovexattr 18 +#define SYS_fremovexattr 16 /* chroot */ #define SYS_chroot 51 /* network batch I/O */ diff --git a/src/syscall/fs-xattr.c b/src/syscall/fs-xattr.c index b368c20..29c3ef5 100644 --- a/src/syscall/fs-xattr.c +++ b/src/syscall/fs-xattr.c @@ -219,8 +219,12 @@ int64_t sys_fsetxattr(guest_t *g, int flags) { host_fd_ref_t host_ref; - if (host_fd_ref_open(fd, &host_ref) < 0) - return -LINUX_EBADF; + /* Linux: fsetxattr on an O_PATH fd returns EBADF (the syscall looks the fd + * up with fdget(), which refuses O_PATH/FMODE_PATH descriptors). 
+ */ + int64_t err = host_fd_ref_open_io(fd, &host_ref); + if (err < 0) + return err; char name[LINUX_XATTR_NAME_MAX + 1]; if (guest_read_str(g, name_gva, name, sizeof(name)) < 0) { @@ -229,7 +233,7 @@ int64_t sys_fsetxattr(guest_t *g, } void *buf; - int64_t err = xattr_alloc_buf(size, &buf); + err = xattr_alloc_buf(size, &buf); if (err < 0) { host_fd_ref_close(&host_ref); return err; @@ -284,8 +288,12 @@ int64_t sys_flistxattr(guest_t *g, int fd, uint64_t list_gva, uint64_t size) int64_t sys_fremovexattr(guest_t *g, int fd, uint64_t name_gva) { host_fd_ref_t host_ref; - if (host_fd_ref_open(fd, &host_ref) < 0) - return -LINUX_EBADF; + /* Linux: fremovexattr on an O_PATH fd returns EBADF, same reason as + * fsetxattr above. + */ + int64_t err = host_fd_ref_open_io(fd, &host_ref); + if (err < 0) + return err; char name[LINUX_XATTR_NAME_MAX + 1]; if (guest_read_str(g, name_gva, name, sizeof(name)) < 0) { diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 283ef7b..f7860f5 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -706,6 +706,12 @@ int64_t sys_getdents64(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) return -LINUX_EBADF; if (fd_table[fd].type == FD_CLOSED) return -LINUX_EBADF; + /* Linux: getdents on an O_PATH fd returns EBADF, even when the underlying + * inode is a directory. The early gate keeps the next NOTDIR fallback + * specific to non-directory regular fds. 
+ */ + if (fd_table[fd].type == FD_PATH) + return -LINUX_EBADF; DIR *dir = (DIR *) fd_table[fd].dir; if (!dir) @@ -910,8 +916,9 @@ int64_t sys_pipe2(guest_t *g, uint64_t fds_gva, int linux_flags) int64_t sys_lseek(int fd, int64_t offset, int whence) { host_fd_ref_t host_ref; - if (host_fd_ref_open(fd, &host_ref) < 0) - return -LINUX_EBADF; + int64_t err = host_fd_ref_open_io(fd, &host_ref); + if (err < 0) + return err; off_t ret = lseek(host_ref.fd, offset, whence); host_fd_ref_close(&host_ref); @@ -1433,18 +1440,22 @@ int64_t sys_faccessat(guest_t *g, int64_t sys_ftruncate(int fd, int64_t length) { + fd_entry_t snap; + if (!fd_snapshot(fd, &snap)) + return -LINUX_EBADF; + /* Linux: ftruncate on an O_PATH fd returns EBADF. */ + if (snap.type == FD_PATH) + return -LINUX_EBADF; + + /* Enforce memfd seals on truncate. */ + int seals = snap.seals; + if (seals & LINUX_F_SEAL_WRITE) + return -LINUX_EPERM; + host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) return -LINUX_EBADF; - /* Enforce memfd seals on truncate. - * fd_to_host above already validated fd is in range. - */ - int seals = fd_table[fd].seals; - if (seals & LINUX_F_SEAL_WRITE) { - host_fd_ref_close(&host_ref); - return -LINUX_EPERM; - } if (seals & (LINUX_F_SEAL_SHRINK | LINUX_F_SEAL_GROW)) { struct stat st; if (fstat(host_ref.fd, &st) == 0) { diff --git a/src/syscall/internal.h b/src/syscall/internal.h index 625b5c6..3a5a2d1 100644 --- a/src/syscall/internal.h +++ b/src/syscall/internal.h @@ -235,6 +235,30 @@ static inline int host_dirfd_ref_open(guest_fd_t dirfd, host_fd_ref_t *ref) return host_fd_ref_open(dirfd, ref); } +/* Open a host fd reference, rejecting O_PATH (FD_PATH) entries with -EBADF. + * Use this for syscalls that operate on the underlying file -- read/write, + * lseek, ftruncate, fsync/fdatasync, flock, fsetxattr/fremovexattr, ioctl, etc. 
+ * Linux returns EBADF on those calls when the fd was opened O_PATH; the host fd + * here is a plain O_RDONLY descriptor, so without this gate the host call would + * silently succeed and diverge from Linux semantics. + * + * Calls that are explicitly allowed on O_PATH (fstat, fstatfs, fchdir, close, + * dup, fcntl get/set CLOEXEC, *at() dirfd) keep using host_{fd,dirfd}_ref_open + * helpers above. + */ +static inline int64_t host_fd_ref_open_io(guest_fd_t guest_fd, + host_fd_ref_t *ref) +{ + fd_entry_t snap; + if (!fd_snapshot(guest_fd, &snap)) + return -LINUX_EBADF; + if (snap.type == FD_PATH) + return -LINUX_EBADF; + if (host_fd_ref_open(guest_fd, ref) < 0) + return -LINUX_EBADF; + return 0; +} + /* Read a guest path string with small-buffer optimization. * * Tries the stack-allocated short_buf first; falls back to long_buf for diff --git a/src/syscall/io.c b/src/syscall/io.c index 3b40c03..b938f42 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -461,19 +461,22 @@ static int64_t host_fd_ref_open_checked(int guest_fd, host_fd_ref_t *ref, bool check_write_seal) { - fd_entry_t snap; - if (!fd_snapshot(guest_fd, &snap)) - return -LINUX_EBADF; - if (snap.type == FD_PATH) - return -LINUX_EBADF; - if (check_write_seal && (snap.seals & LINUX_F_SEAL_WRITE)) - return -LINUX_EPERM; - return host_fd_ref_open(guest_fd, ref) < 0 ? -LINUX_EBADF : 0; + if (check_write_seal) { + fd_entry_t snap; + if (!fd_snapshot(guest_fd, &snap)) + return -LINUX_EBADF; + if (snap.type == FD_PATH) + return -LINUX_EBADF; + if (snap.seals & LINUX_F_SEAL_WRITE) + return -LINUX_EPERM; + return host_fd_ref_open(guest_fd, ref) < 0 ? 
-LINUX_EBADF : 0; + } + return host_fd_ref_open_io(guest_fd, ref); } static int64_t host_fd_ref_open_regular_io(int guest_fd, host_fd_ref_t *ref) { - return host_fd_ref_open_checked(guest_fd, ref, false); + return host_fd_ref_open_io(guest_fd, ref); } static int64_t proc_try_writev_intercept(int fd, diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index edcc09b..7d82918 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -1013,8 +1013,9 @@ static int64_t sc_flock(guest_t *g, (void) x5; (void) verbose; host_fd_ref_t host_ref; - if (host_fd_ref_open((int) x0, &host_ref) < 0) - return -LINUX_EBADF; + int64_t err = host_fd_ref_open_io((int) x0, &host_ref); + if (err < 0) + return err; int64_t ret = flock(host_ref.fd, (int) x1) < 0 ? linux_errno() : 0; host_fd_ref_close(&host_ref); return ret; @@ -1038,8 +1039,9 @@ static int64_t sc_fsync_common(guest_t *g, (void) x5; (void) verbose; host_fd_ref_t host_ref; - if (host_fd_ref_open((int) x0, &host_ref) < 0) - return -LINUX_EBADF; + int64_t err = host_fd_ref_open_io((int) x0, &host_ref); + if (err < 0) + return err; int64_t ret = (fsync(host_ref.fd) < 0) ? linux_errno() : 0; host_fd_ref_close(&host_ref); return ret; diff --git a/tests/manifest.txt b/tests/manifest.txt index 789acfd..69be913 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -86,6 +86,9 @@ test-cow-fork [section] O_CLOEXEC tests test-cloexec +[section] O_PATH semantics tests +test-opath + [section] Guard page / mmap edge cases test-guard-page diff --git a/tests/test-opath.c b/tests/test-opath.c new file mode 100644 index 0000000..c4fa84b --- /dev/null +++ b/tests/test-opath.c @@ -0,0 +1,292 @@ +/* O_PATH semantics tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Linux O_PATH descriptors carry only a path reference: read/write/lseek/ + * ioctl/fsync/flock/ftruncate/fchmod/fchown/getdents/fsetxattr/fremovexattr + * must all return EBADF. 
fstat/fstatfs/close/dup/fcntl-cloexec/fchdir and + * use as a *at() dirfd remain valid. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +#ifndef O_PATH +#define O_PATH 010000000 +#endif + +int passes = 0, fails = 0; + +static char tmp_file[256]; +static char tmp_dir[256]; + +static void setup_fixtures(void) +{ + snprintf(tmp_file, sizeof(tmp_file), "/tmp/elfuse-opath-%d.txt", + (int) getpid()); + snprintf(tmp_dir, sizeof(tmp_dir), "/tmp/elfuse-opath-dir-%d", + (int) getpid()); + + int fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd >= 0) { + write(fd, "hello\n", 6); + close(fd); + } + mkdir(tmp_dir, 0755); + char child[300]; + snprintf(child, sizeof(child), "%s/child", tmp_dir); + fd = open(child, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd >= 0) + close(fd); +} + +static void teardown_fixtures(void) +{ + char child[300]; + snprintf(child, sizeof(child), "%s/child", tmp_dir); + unlink(child); + rmdir(tmp_dir); + unlink(tmp_file); +} + +/* Helper: open an O_PATH fd or skip the test on failure. 
*/ +static int open_path_or_fail(const char *path, const char *what) +{ + int fd = open(path, O_PATH); + if (fd < 0) + FAIL(what); + return fd; +} + +static void test_open_path_smoke(void) +{ + TEST("open(O_PATH) on regular file"); + int fd = open(tmp_file, O_PATH); + if (fd < 0) { + FAIL("open O_PATH"); + return; + } + PASS(); + + TEST("F_GETFL reflects O_PATH"); + int fl = fcntl(fd, F_GETFL); + EXPECT_TRUE(fl != -1 && (fl & O_PATH), "F_GETFL missing O_PATH"); + + close(fd); +} + +static void test_read_write_rejected(void) +{ + int fd = open_path_or_fail(tmp_file, "open"); + if (fd < 0) + return; + + char buf[8]; + TEST("read(O_PATH) -> EBADF"); + EXPECT_ERRNO(read(fd, buf, sizeof(buf)), EBADF, "read"); + TEST("write(O_PATH) -> EBADF"); + EXPECT_ERRNO(write(fd, "x", 1), EBADF, "write"); + TEST("pread(O_PATH) -> EBADF"); + EXPECT_ERRNO(pread(fd, buf, sizeof(buf), 0), EBADF, "pread"); + TEST("pwrite(O_PATH) -> EBADF"); + EXPECT_ERRNO(pwrite(fd, "x", 1, 0), EBADF, "pwrite"); + + struct iovec iov = {.iov_base = buf, .iov_len = sizeof(buf)}; + TEST("readv(O_PATH) -> EBADF"); + EXPECT_ERRNO(readv(fd, &iov, 1), EBADF, "readv"); + iov.iov_base = "x"; + iov.iov_len = 1; + TEST("writev(O_PATH) -> EBADF"); + EXPECT_ERRNO(writev(fd, &iov, 1), EBADF, "writev"); + + close(fd); +} + +static void test_seek_truncate_sync_lock_rejected(void) +{ + int fd = open_path_or_fail(tmp_file, "open"); + if (fd < 0) + return; + + TEST("lseek(O_PATH) -> EBADF"); + EXPECT_ERRNO(lseek(fd, 0, SEEK_SET), EBADF, "lseek"); + TEST("ftruncate(O_PATH) -> EBADF"); + EXPECT_ERRNO(ftruncate(fd, 0), EBADF, "ftruncate"); + TEST("fsync(O_PATH) -> EBADF"); + EXPECT_ERRNO(fsync(fd), EBADF, "fsync"); + TEST("fdatasync(O_PATH) -> EBADF"); + EXPECT_ERRNO(fdatasync(fd), EBADF, "fdatasync"); + TEST("flock(O_PATH) -> EBADF"); + EXPECT_ERRNO(flock(fd, LOCK_EX | LOCK_NB), EBADF, "flock"); + + close(fd); +} + +static void test_ioctl_fchmod_fchown_rejected(void) +{ + int fd = open_path_or_fail(tmp_file, "open"); + if 
(fd < 0) + return; + + int avail = 0; + TEST("ioctl(O_PATH, FIONREAD) -> EBADF"); + EXPECT_ERRNO(ioctl(fd, FIONREAD, &avail), EBADF, "ioctl"); + TEST("fchmod(O_PATH) -> EBADF"); + EXPECT_ERRNO(fchmod(fd, 0644), EBADF, "fchmod"); + TEST("fchown(O_PATH) -> EBADF"); + EXPECT_ERRNO(fchown(fd, getuid(), getgid()), EBADF, "fchown"); + + close(fd); +} + +static void test_xattr_rejected(void) +{ + int fd = open_path_or_fail(tmp_file, "open"); + if (fd < 0) + return; + + /* The setxattr/removexattr path may also return ENOTSUP/EOPNOTSUPP on + * filesystems that lack xattr support. Only require EBADF from elfuse; + * skip these checks on a filesystem that rejects xattr outright on a + * fresh /tmp file (rare; APFS and ext4 both honor user.* xattrs). + */ + int probe = setxattr(tmp_file, "user.elfuse_probe", "x", 1, 0); + if (probe < 0 && errno != EEXIST) { + if (errno == ENOTSUP || errno == EOPNOTSUPP) { + close(fd); + return; + } + } else { + removexattr(tmp_file, "user.elfuse_probe"); + } + + TEST("fsetxattr(O_PATH) -> EBADF"); + EXPECT_ERRNO(fsetxattr(fd, "user.elfuse_test", "v", 1, 0), EBADF, + "fsetxattr"); + TEST("fremovexattr(O_PATH) -> EBADF"); + EXPECT_ERRNO(fremovexattr(fd, "user.elfuse_test"), EBADF, "fremovexattr"); + + close(fd); +} + +static void test_getdents_rejected(void) +{ + int fd = open(tmp_dir, O_PATH | O_DIRECTORY); + if (fd < 0) { + FAIL("open O_PATH dir"); + return; + } + + /* glibc's readdir() works through fdopendir, but fdopendir itself uses + * fstat + getdents64. The kernel returns EBADF on getdents64; glibc + * surfaces that via readdir errno or fdopendir failure. Use the raw + * syscall to keep the contract precise. 
+ */ + char buf[1024]; + TEST("getdents64(O_PATH dir) -> EBADF"); + long n = syscall(SYS_getdents64, fd, buf, sizeof(buf)); + if (n == -1 && errno == EBADF) + PASS(); + else + FAIL("getdents64"); + + close(fd); +} + +static void test_allowed_on_path_fd(void) +{ + int fd = open_path_or_fail(tmp_file, "open"); + if (fd < 0) + return; + + struct stat st; + TEST("fstat(O_PATH) succeeds"); + EXPECT_TRUE(fstat(fd, &st) == 0 && S_ISREG(st.st_mode), "fstat"); + + TEST("dup(O_PATH) succeeds"); + int dupfd = dup(fd); + if (dupfd < 0) { + FAIL("dup"); + } else { + int fl = fcntl(dupfd, F_GETFL); + if (fl != -1 && (fl & O_PATH)) + PASS(); + else + FAIL("dup lost O_PATH"); + close(dupfd); + } + + TEST("F_SETFD/F_GETFD on O_PATH"); + if (fcntl(fd, F_SETFD, FD_CLOEXEC) == 0 && + (fcntl(fd, F_GETFD) & FD_CLOEXEC)) + PASS(); + else + FAIL("fcntl FD_CLOEXEC"); + + close(fd); +} + +static void test_path_dir_as_dirfd(void) +{ + int dfd = open(tmp_dir, O_PATH | O_DIRECTORY); + if (dfd < 0) { + FAIL("open O_PATH dir"); + return; + } + + struct stat st; + TEST("fstatat(O_PATH dirfd, \"child\")"); + EXPECT_TRUE(fstatat(dfd, "child", &st, 0) == 0, "fstatat via O_PATH"); + + TEST("openat(O_PATH dirfd, \"child\")"); + int childfd = openat(dfd, "child", O_RDONLY); + if (childfd >= 0) { + PASS(); + close(childfd); + } else { + FAIL("openat via O_PATH"); + } + + TEST("fchdir(O_PATH dir)"); + EXPECT_TRUE(fchdir(dfd) == 0, "fchdir via O_PATH"); + + close(dfd); +} + +int main(void) +{ + printf("test-opath: O_PATH semantics tests\n"); + + setup_fixtures(); + + test_open_path_smoke(); + test_read_write_rejected(); + test_seek_truncate_sync_lock_rejected(); + test_ioctl_fchmod_fchown_rejected(); + test_xattr_rejected(); + test_getdents_rejected(); + test_allowed_on_path_fd(); + test_path_dir_as_dirfd(); + + teardown_fixtures(); + + SUMMARY("test-opath"); + return fails > 0 ? 1 : 0; +}