diff --git a/Makefile b/Makefile index 8ca6e4c..2895a93 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,12 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR) @echo " CROSS $< (with -lpthread)" $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread +# test-signalfd-hardening needs -lpthread for the worker-thread tid +# regression case in test_rt_sigqueueinfo_rejects_thread_tid. +$(BUILD_DIR)/test-signalfd-hardening: tests/test-signalfd-hardening.c | $(BUILD_DIR) + @echo " CROSS $< (with -lpthread)" + $(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread + endif include mk/tests.mk diff --git a/src/syscall/abi.h b/src/syscall/abi.h index 578253c..6315039 100644 --- a/src/syscall/abi.h +++ b/src/syscall/abi.h @@ -97,6 +97,7 @@ #define SYS_rt_sigaction 134 #define SYS_rt_sigprocmask 135 #define SYS_rt_sigpending 136 +#define SYS_rt_sigqueueinfo 138 #define SYS_rt_sigreturn 139 #define SYS_setpriority 140 #define SYS_getpriority 141 diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl index ea421c6..2925ca1 100644 --- a/src/syscall/dispatch.tbl +++ b/src/syscall/dispatch.tbl @@ -111,6 +111,7 @@ SYS_rt_sigaction sc_rt_sigaction 1 SYS_rt_sigprocmask sc_rt_sigprocmask 1 SYS_rt_sigpending sc_rt_sigpending 0 SYS_rt_sigreturn sc_rt_sigreturn 1 +SYS_rt_sigqueueinfo sc_rt_sigqueueinfo 1 SYS_rt_tgsigqueueinfo sc_rt_tgsigqueueinfo 1 # Time and timers diff --git a/src/syscall/fd.c b/src/syscall/fd.c index 04c7675..ebc2d95 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -885,15 +885,31 @@ int64_t sys_signalfd4(guest_t *g, return gfd; } -/* Read from signalfd: consume pending signals matching the mask. - * Each signal produces one signalfd_siginfo (128 bytes). - * Returns number of bytes read, or -EAGAIN if nothing pending. +/* Read from signalfd: consume pending signals matching the signalfd's mask. + * + * Each signal produces one signalfd_siginfo (128 bytes). 
RT signals (32-64) + * are queued: each sigqueue/rt_tgsigqueueinfo enqueues a distinct instance with + * its own si_int/si_ptr payload, and signalfd_read returns them in FIFO order + * without coalescing (Linux behavior). + * + * Per-thread signal mask is intentionally not consulted: signalfd is the + * standard mechanism for reading signals that were blocked from synchronous + * delivery via sigprocmask(). The signalfd's own mask (set at create time or + * via signalfd(fd, &mask, ...)) is the only filter applied. + * + * ssi_int/ssi_ptr are populated from queued metadata when present. + * Standard signals (1-31) still coalesce to one pending instance, but Linux + * preserves one siginfo payload for that instance. + * + * Returns the number of bytes read (multiple of sizeof(signalfd_siginfo)), or + * -EAGAIN if nothing pending and the fd is non-blocking. */ int64_t signalfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) { +retry: /* Capture slot state under sfd_lock, then release BEFORE calling * signal_get_state() which acquires sig_lock(4). Holding sfd_lock(5a) * while taking sig_lock(4) would violate lock ordering. @@ -963,10 +979,21 @@ int64_t signalfd_read(int guest_fd, if (deliverable == 0) goto no_pending; } - total = signal_peek_signalfd(mask, pending, max_signals); - if (total == 0) + size_t peeked = signal_peek_signalfd(mask, pending, max_signals); + if (peeked == 0) goto no_pending; - for (size_t i = 0; i < total; i++) { + + /* Write-then-take. Writing first means that on a guest_write_small EFAULT + * the rt-queue is still intact and signals are not lost: no re-queue dance, + * no RT_SIGQUEUE_MAX overflow window, no extra signalfd_notify writes that + * would desync the pipe-byte count from the actual pending-signal count. 
+ * Take only the prefix the writer landed; if a concurrent consumer advanced + * the rt-queue head between peek and take, take returns less than the + * written count and the bridge restarts the read loop via the retry label + * below. + */ + size_t written = 0; + for (size_t i = 0; i < peeked; i++) { linux_signalfd_siginfo_t info; memset(&info, 0, sizeof(info)); info.ssi_signo = (uint32_t) pending[i].signum; @@ -978,12 +1005,34 @@ int64_t signalfd_read(int guest_fd, uint64_t off = i * sizeof(linux_signalfd_siginfo_t); if (guest_write_small(g, buf_gva + off, &info, sizeof(info)) < 0) { - if (pending != pending_stack) - free(pending); - return -LINUX_EFAULT; + if (written == 0) { + /* No bytes transferred: surface EFAULT, leave the queue + * untouched so the signal is not lost. Matches the elfuse + * promise locked in by tests/test-tier-b's + * test_signalfd_efault_preserves_pending. + */ + if (pending != pending_stack) + free(pending); + return -LINUX_EFAULT; + } + + /* Partial success: stop writing and let take consume only the + * delivered prefix. The unwritten entries stay in the rt-queue + * naturally because the take call has not run yet. + */ + break; } + written++; + } + + total = signal_take_signalfd_exact(pending, written); + if (total == 0) { + if (written == 0) + goto no_pending; + if (pending != pending_stack) + free(pending); + goto retry; } - total = signal_take_signalfd_exact(pending, total); /* Drain pipe: consume exactly one byte per signal read. 
If the code drains * ALL bytes, the code would lose notifications for signals that arrived @@ -998,7 +1047,7 @@ int64_t signalfd_read(int guest_fd, if (pending != pending_stack) free(pending); - return (int64_t) (total * sizeof(linux_signalfd_siginfo_t)); + return (int64_t) total * (int64_t) sizeof(linux_signalfd_siginfo_t); no_pending: if (pending != pending_stack) diff --git a/src/syscall/signal.c b/src/syscall/signal.c index d8e9149..1ea952f 100644 --- a/src/syscall/signal.c +++ b/src/syscall/signal.c @@ -4,12 +4,12 @@ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd. * SPDX-License-Identifier: Apache-2.0 * - * Implements Linux-compatible signal delivery for aarch64 guests. When a - * signal is queued (e.g., SIGPIPE from write() to broken pipe), signal - * emulation builds an rt_sigframe on the guest stack matching the kernel's - * setup_rt_frame() layout, then redirects the vCPU to the guest's signal - * handler. The guest handler eventually calls rt_sigreturn (SYS 139), which - * restores the saved register state from the frame. + * Implements Linux-compatible signal delivery for aarch64 guests. When a signal + * is queued (e.g., SIGPIPE from write() to broken pipe), signal emulation + * builds an rt_sigframe on the guest stack matching the kernel's setup_rt_frame + * layout, then redirects the vCPU to the guest's signal handler. The guest + * handler eventually calls rt_sigreturn (SYS 139), which restores the saved + * register state from the frame. 
* * Reference: Linux arch/arm64/kernel/signal.c */ @@ -161,10 +161,9 @@ static inline int sig_uncatchable(int signum) return signum == LINUX_SIGKILL || signum == LINUX_SIGSTOP; } -static void signal_rt_enqueue_locked(int signum, const signal_rt_info_t *info) +static signal_rt_info_t signal_default_info(int signum) { - int idx = signum - LINUX_SIGRTMIN; - signal_rt_info_t fallback = { + return (signal_rt_info_t) { .signum = signum, .si_code = LINUX_SI_USER, .si_pid = (int32_t) proc_get_pid(), @@ -172,6 +171,33 @@ static void signal_rt_enqueue_locked(int signum, const signal_rt_info_t *info) .si_int = 0, .si_ptr = 0, }; +} + +static void signal_standard_enqueue_locked(int signum, + const signal_rt_info_t *info) +{ + int idx = signum - 1; + uint64_t bit = sig_bit(signum); + + if (!(sig_state.pending & bit)) { + sig_state.std_info[idx] = info ? *info : signal_default_info(signum); + sig_state.std_info_valid[idx] = info != NULL; + } + sig_state.pending |= bit; +} + +static signal_rt_info_t signal_standard_peek_locked(int signum) +{ + int idx = signum - 1; + if (sig_state.std_info_valid[idx]) + return sig_state.std_info[idx]; + return signal_default_info(signum); +} + +static void signal_rt_enqueue_locked(int signum, const signal_rt_info_t *info) +{ + int idx = signum - LINUX_SIGRTMIN; + signal_rt_info_t fallback = signal_default_info(signum); const signal_rt_info_t *entry = info ? info : &fallback; sig_state.pending |= sig_bit(signum); @@ -279,9 +305,10 @@ void signal_queue(int signum) if (signum < 1 || signum > LINUX_NSIG) return; pthread_mutex_lock(&sig_lock); - sig_state.pending |= sig_bit(signum); if (signum >= LINUX_SIGRTMIN) signal_rt_enqueue_locked(signum, NULL); + else + signal_standard_enqueue_locked(signum, NULL); /* Publish hint before releasing lock so vCPU hot path sees it. 
*/ atomic_store_explicit(&sig_pending_hint, sig_state.pending, memory_order_release); @@ -317,7 +344,17 @@ void signal_queue_rt(int signum, int32_t si_int, uint64_t si_ptr) { - if (signum < LINUX_SIGRTMIN || signum > LINUX_NSIG) + signal_queue_info(signum, si_code, si_pid, si_uid, si_int, si_ptr); +} + +void signal_queue_info(int signum, + int32_t si_code, + int32_t si_pid, + uint32_t si_uid, + int32_t si_int, + uint64_t si_ptr) +{ + if (signum < 1 || signum > LINUX_NSIG) return; pthread_mutex_lock(&sig_lock); signal_rt_info_t info = { @@ -328,7 +365,10 @@ void signal_queue_rt(int signum, .si_int = si_int, .si_ptr = si_ptr, }; - signal_rt_enqueue_locked(signum, &info); + if (signum >= LINUX_SIGRTMIN) + signal_rt_enqueue_locked(signum, &info); + else + signal_standard_enqueue_locked(signum, &info); atomic_store_explicit(&sig_pending_hint, sig_state.pending, memory_order_release); pthread_mutex_unlock(&sig_lock); @@ -416,7 +456,12 @@ static size_t signal_collect_signalfd(uint64_t mask, pthread_mutex_lock(&sig_lock); uint64_t deliverable = sig_state.pending & mask; - for (int signum = 1; signum < LINUX_NSIG && total < max; signum++) { + /* signum runs 1..LINUX_NSIG inclusive (64 is the highest valid RT signal + * on aarch64 Linux). Bare-musl applications can target SIGRTMAX directly, + * so the inclusive bound matters even though glibc reserves the top of the + * RT range for itself. 
+ */ + for (int signum = 1; signum <= LINUX_NSIG && total < max; signum++) { uint64_t bit = BIT64(signum - 1); if (!(deliverable & bit)) continue; @@ -446,14 +491,9 @@ static size_t signal_collect_signalfd(uint64_t mask, total++; } } else { - signal_rt_info_t info = { - .signum = signum, - .si_code = LINUX_SI_USER, - .si_pid = (int32_t) proc_get_pid(), - .si_uid = proc_get_uid(), - .si_int = 0, - .si_ptr = 0, - }; + signal_rt_info_t info = signal_standard_peek_locked(signum); + if (consume) + sig_state.std_info_valid[signum - 1] = false; if (consume) sig_state.pending &= ~bit; if (out) @@ -482,7 +522,7 @@ size_t signal_take_signalfd_exact(const signal_rt_info_t *expected, size_t max) pthread_mutex_lock(&sig_lock); for (; total < max; total++) { int signum = expected[total].signum; - if (signum <= 0 || signum >= LINUX_NSIG) + if (signum <= 0 || signum > LINUX_NSIG) break; uint64_t bit = sig_bit(signum); @@ -508,6 +548,15 @@ size_t signal_take_signalfd_exact(const signal_rt_info_t *expected, size_t max) continue; } + signal_rt_info_t current = signal_standard_peek_locked(signum); + const signal_rt_info_t *want = &expected[total]; + if (current.signum != want->signum || + current.si_code != want->si_code || + current.si_pid != want->si_pid || current.si_uid != want->si_uid || + current.si_int != want->si_int || current.si_ptr != want->si_ptr) + break; + + sig_state.std_info_valid[signum - 1] = false; sig_state.pending &= ~bit; } atomic_store_explicit(&sig_pending_hint, sig_state.pending, @@ -1107,14 +1156,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) /* Find lowest pending unblocked signal */ int signum = bit_ctz64(deliverable) + 1; - signal_rt_info_t rt_info = { - .signum = signum, - .si_code = LINUX_SI_USER, - .si_pid = (int32_t) proc_get_pid(), - .si_uid = proc_get_uid(), - .si_int = 0, - .si_ptr = 0, - }; + signal_rt_info_t rt_info = signal_default_info(signum); /* Dequeue: for RT signals, decrement count and only clear the * pending bit when 
the queue is empty. Standard signals are @@ -1123,6 +1165,8 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) if (signum >= LINUX_SIGRTMIN) { signal_rt_dequeue_locked(signum, &rt_info); } else { + rt_info = signal_standard_peek_locked(signum); + sig_state.std_info_valid[signum - 1] = false; sig_state.pending &= ~sig_bit(signum); } @@ -1210,8 +1254,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code) frame.info.si_code = rt_info.si_code; frame.info.si_pid = rt_info.si_pid; frame.info.si_uid = (int32_t) rt_info.si_uid; - if (signum >= LINUX_SIGRTMIN) - frame.info.si_value = rt_info.si_ptr; + frame.info.si_value = rt_info.si_ptr; } /* ucontext: embed a per-delivery cookie in uc_flags for SROP diff --git a/src/syscall/signal.h b/src/syscall/signal.h index 91c8cef..aff266e 100644 --- a/src/syscall/signal.h +++ b/src/syscall/signal.h @@ -184,8 +184,13 @@ typedef struct { bool saved_blocked_valid; /* True if saved_blocked is set */ linux_stack_t altstack; /* Alternate signal stack (sigaltstack) */ bool on_altstack; /* True if currently delivering on altstack */ + /* Standard signal metadata: Linux coalesces signals 1-31, but preserves one + * siginfo payload for the pending instance. + */ + bool std_info_valid[LINUX_SIGRTMIN - 1]; + signal_rt_info_t std_info[LINUX_SIGRTMIN - 1]; /* RT signal queue: count of pending instances per signal. - * Standard signals (1-31) use only the pending bitmask (coalesced). + * Standard signals (1-31) use the pending bitmask plus std_info[]. * RT signals (32-64) are queued: each instance is tracked separately. */ int rt_queue[RT_SIGNAL_COUNT]; @@ -193,7 +198,7 @@ typedef struct { signal_rt_info_t rt_info[RT_SIGNAL_COUNT][RT_SIGQUEUE_MAX]; } signal_state_t; -/* API. */ +/* API */ /* Initialize signal state: all SIG_DFL, nothing pending/blocked. */ void signal_init(void); @@ -215,6 +220,16 @@ void signal_queue_rt(int signum, int32_t si_int, uint64_t si_ptr); +/* Queue a signal with explicit siginfo metadata. 
Standard signals preserve + * one payload while coalesced; RT signals enqueue every instance. + */ +void signal_queue_info(int signum, + int32_t si_code, + int32_t si_pid, + uint32_t si_uid, + int32_t si_int, + uint64_t si_ptr); + /* Set fault info for the next signal delivery. When set, signal_deliver() * populates si_code, si_addr, fault_address, and ESR context from these * values instead of using the default SI_USER/si_pid fields. Consumed diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index ce40446..edcc09b 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -702,7 +702,14 @@ static int64_t sc_rt_tgsigqueueinfo(guest_t *g, return -LINUX_ESRCH; linux_siginfo_t info; memset(&info, 0, sizeof(info)); - if (uinfo_gva && guest_read_small(g, uinfo_gva, &info, sizeof(info)) == 0) { + if (uinfo_gva && guest_read_small(g, uinfo_gva, &info, sizeof(info)) < 0) { + log_debug( + "rt_tgsigqueueinfo(tgid=%d, tid=%d, sig=%d, " + "uinfo=0x%llx [unreadable])", + tgid, tid, sig, (unsigned long long) uinfo_gva); + return -LINUX_EFAULT; + } + if (uinfo_gva) { bool is_fault = (sig == LINUX_SIGTRAP || sig == LINUX_SIGSEGV || sig == LINUX_SIGBUS || sig == LINUX_SIGFPE || sig == LINUX_SIGILL); @@ -717,25 +724,58 @@ static int64_t sc_rt_tgsigqueueinfo(guest_t *g, } else log_debug("rt_tgsigqueueinfo(tgid=%d, tid=%d, sig=%d, si_code=%d)", tgid, tid, sig, info.si_code); - } else - log_debug( - "rt_tgsigqueueinfo(tgid=%d, tid=%d, sig=%d, " - "uinfo=0x%llx [unreadable])", - tgid, tid, sig, (unsigned long long) uinfo_gva); - /* RT signals: extract sigval from the queued-signal payload fields. */ - if (sig >= LINUX_SIGRTMIN && uinfo_gva) { + } + /* Queued signals carry sigval in si_value for both standard and RT + * signals; standard signals still coalesce to one pending instance. 
+ */ + if (uinfo_gva) { int32_t si_int = 0; memcpy(&si_int, &info.si_value, sizeof(si_int)); uint64_t si_ptr = 0; memcpy(&si_ptr, &info.si_value, sizeof(si_ptr)); - signal_queue_rt(sig, info.si_code, info.si_pid, (uint32_t) info.si_uid, - si_int, si_ptr); + signal_queue_info(sig, info.si_code, info.si_pid, + (uint32_t) info.si_uid, si_int, si_ptr); } else { signal_queue(sig); } return 0; } +/* rt_sigqueueinfo(pid, sig, info) -- POSIX sigqueue() in glibc/musl uses this. + * + * The first argument is documented as a process identifier, but real Linux + * is permissive: kill_pid_info() looks pid up in the task table and routes + * the signal through PIDTYPE_TGID, so a thread id that resolves to a task + * succeeds and the signal lands in that task's thread-group pending set. + * Foreign pids that match no task return -ESRCH. + * + * elfuse mirrors this by forwarding to sc_rt_tgsigqueueinfo with + * tgid==tid==pid: the downstream thread_find() lookup accepts any guest + * thread's tid (collapsing to the single guest tgid), the + * proc_get_pid() fallback accepts the main thread's tid, and unknown + * pids fall through to -ESRCH. signal_queue_info() then queues + * process-wide so the routing semantics match Linux even though the + * lookup goes through the per-thread table. + * + * Earlier review feedback flagged "incorrectly accepting thread ids" + * and recommended a strict pid==tgid gate; that gate was tried and + * rejected because the qemu/Linux reference accepts the same tids. + */ +static int64_t sc_rt_sigqueueinfo(guest_t *g, + uint64_t x0, + uint64_t x1, + uint64_t x2, + uint64_t x3, + uint64_t x4, + uint64_t x5, + bool verbose) +{ + (void) x3; + (void) x4; + (void) x5; + return sc_rt_tgsigqueueinfo(g, x0, x0, x1, x2, 0, 0, verbose); +} + static int64_t sc_rt_sigreturn(guest_t *g, uint64_t x0, uint64_t x1, @@ -788,8 +828,8 @@ static int64_t sc_prctl(guest_t *g, case LINUX_PR_GET_DUMPABLE: return 1; case LINUX_PR_SET_CHILD_SUBREAPER: - /* Accept silently. 
elfuse's process model already reaps all - * children within the VM; the flag has no additional effect. + /* Accept silently. elfuse's process model already reaps all children + * within the VM; the flag has no additional effect. */ return 0; case LINUX_PR_GET_CHILD_SUBREAPER: { @@ -809,8 +849,8 @@ static int64_t sc_prctl(guest_t *g, return (x1 <= LINUX_CAP_LAST_CAP) ? 1 : -LINUX_EINVAL; case LINUX_PR_SET_VMA: /* PR_SET_VMA with PR_SET_VMA_ANON_NAME: accept and ignore. - * Android and memory profiling tools use this to name anonymous - * mmap regions. The name is purely advisory. + * Android and memory profiling tools use this to name anonymous mmap + * regions. The name is purely advisory. */ if ((int) x1 == LINUX_PR_SET_VMA_ANON_NAME) return 0; @@ -1168,8 +1208,8 @@ static int64_t sc_openat2(guest_t *g, return -LINUX_EAGAIN; /* For RESOLVE_NO_SYMLINKS, RESOLVE_NO_MAGICLINKS, RESOLVE_BENEATH, - * RESOLVE_IN_ROOT: read the guest path and enforce constraints - * before opening. + * RESOLVE_IN_ROOT: read the guest path and enforce constraints before + * opening. */ if (resolve & (RESOLVE_NO_SYMLINKS | RESOLVE_NO_MAGICLINKS | RESOLVE_BENEATH | RESOLVE_IN_ROOT)) { @@ -1285,8 +1325,8 @@ static int64_t sc_execveat(guest_t *g, hv_vcpu_t vcpu = current_thread->vcpu; int dirfd = (int) x0, flags = (int) x4; - /* Resolve the target path before taking mmap_lock (path resolution - * may call fd_to_host / openat which do not need mmap_lock). + /* Resolve the target path before taking mmap_lock (path resolution may call + * fd_to_host / openat which do not need mmap_lock). */ uint64_t path_gva = x1; char resolved[LINUX_PATH_MAX]; @@ -1534,9 +1574,9 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) goto slow_path; /* Pre-filter: only fast-path fd types that map 1:1 to host - * read/write. This read is racy but benign; if the type - * changed, fd_to_host_dup will either fail or the slow path - * handles it correctly on fallthrough. + * read/write. 
This read is racy but benign; if the type changed, + * fd_to_host_dup will either fail or the slow path handles it + * correctly on fallthrough. */ int tp = fd_table[fd].type; if (tp != FD_REGULAR && tp != FD_STDIO && tp != FD_PIPE && diff --git a/tests/manifest.txt b/tests/manifest.txt index 17846dc..789acfd 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -49,6 +49,7 @@ test-poll # diff=skip [section] I/O subsystem tests test-eventfd test-signalfd +test-signalfd-hardening test-epoll test-epoll-edge test-timerfd diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh index cd5d6d0..39e06a6 100755 --- a/tests/test-matrix.sh +++ b/tests/test-matrix.sh @@ -346,6 +346,8 @@ run_unit_tests() printf "\nI/O subsystem\n" test_check "$runner" "test-eventfd" "0 failed" "$bindir/test-eventfd" test_check "$runner" "test-signalfd" "0 failed" "$bindir/test-signalfd" + test_check "$runner" "test-signalfd-hardening" "0 failed" \ + "$bindir/test-signalfd-hardening" test_check "$runner" "test-epoll" "0 failed" "$bindir/test-epoll" test_check "$runner" "test-epoll-edge" "0 failed" "$bindir/test-epoll-edge" test_check "$runner" "test-timerfd" "0 failed" "$bindir/test-timerfd" diff --git a/tests/test-signalfd-hardening.c b/tests/test-signalfd-hardening.c new file mode 100644 index 0000000..38b7f9d --- /dev/null +++ b/tests/test-signalfd-hardening.c @@ -0,0 +1,871 @@ +/* signalfd read semantics hardening + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Covers: + * 1. RT signal multiplicity: each sigqueue/rt_tgsigqueueinfo enqueues a + * distinct instance with its own si_int payload, returned in FIFO + * order without coalescing. + * 2. Standard signals (1-31) coalesce -- multiple kill()s produce one + * signalfd record (kernel parity). + * 3. ssi_int / ssi_ptr round-trip via sigqueue() (rt_sigqueueinfo) and + * direct rt_tgsigqueueinfo. + * 4. 
SIGRTMAX (signum 64) is reachable via signalfd (regression for the + * off-by-one that excluded signum == LINUX_NSIG from the collect / + * take loops). + * 5. signalfd's own mask is the only filter -- per-thread blocked mask + * is intentionally not consulted, matching Linux semantics. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +int passes = 0, fails = 0; + +#ifndef SYS_rt_tgsigqueueinfo +#define SYS_rt_tgsigqueueinfo 240 +#endif + +#ifndef SYS_rt_sigqueueinfo +#define SYS_rt_sigqueueinfo 138 +#endif + +/* siginfo_t crosses both glibc and musl, but si_value layouts differ. + * Build the kernel-shaped buffer by hand so the test stays libc-agnostic. + */ +static void build_kernel_siginfo(int sig, + int code, + pid_t sender_pid, + uid_t sender_uid, + int payload_int, + void *payload_ptr, + unsigned char out[128]) +{ + memset(out, 0, 128); + int32_t s32; + uint64_t u64; + s32 = sig; + memcpy(out + 0, &s32, 4); + s32 = 0; + memcpy(out + 4, &s32, 4); /* si_errno */ + s32 = code; + memcpy(out + 8, &s32, 4); + /* offset 12 is _pad0 (or part of _sifields alignment). Linux's _sifields + * starts at offset 16 on aarch64; for SI_QUEUE the layout there is: + * si_pid (4) si_uid (4) si_value (8) + */ + s32 = sender_pid; + memcpy(out + 16, &s32, 4); + s32 = sender_uid; + memcpy(out + 20, &s32, 4); + s32 = payload_int; + memcpy(out + 24, &s32, 4); + /* Kernel ignores the upper 4 bytes of si_value's int form, but writes the + * pointer form into the full 8-byte slot at offset 24 for sigval_t. The + * pointer goes into the low 8 bytes so signal_queue_rt() reads either + * representation correctly. + */ + u64 = (uint64_t) (uintptr_t) payload_ptr; + memcpy(out + 24, &u64, 8); + /* If both int and ptr are set, ptr wins because it overlaps. Tests pick + * one or the other. 
+ */ + if (payload_ptr == NULL) { + s32 = payload_int; + memcpy(out + 24, &s32, 4); + } +} + +static int raw_rt_tgsigqueueinfo(pid_t tgid, + pid_t tid, + int sig, + const unsigned char info[128]) +{ + return (int) syscall(SYS_rt_tgsigqueueinfo, tgid, tid, sig, info); +} + +static int raw_rt_sigqueueinfo(pid_t pid, int sig, const void *info) +{ + return (int) syscall(SYS_rt_sigqueueinfo, pid, sig, info); +} + +static void test_rt_multiplicity(void) +{ + TEST("RT multiplicity FIFO + payload"); + + int sig = SIGRTMIN + 1; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + const int payloads[] = {0x1111, 0x2222, 0x3333}; + const int N = sizeof(payloads) / sizeof(payloads[0]); + pid_t pid = getpid(); + for (int i = 0; i < N; i++) { + unsigned char info[128]; + /* SI_QUEUE == -1 is the kernel marker for sigqueue-style payload. */ + build_kernel_siginfo(sig, -1, pid, getuid(), payloads[i], NULL, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + close(fd); + FAIL("rt_tgsigqueueinfo"); + return; + } + } + + struct signalfd_siginfo buf[4]; + memset(buf, 0, sizeof(buf)); + ssize_t r = read(fd, buf, sizeof(buf)); + close(fd); + + if (r != (ssize_t) (N * sizeof(buf[0]))) { + printf("FAIL: read returned %zd, expected %zu\n", r, + N * sizeof(buf[0])); + fails++; + return; + } + for (int i = 0; i < N; i++) { + if (buf[i].ssi_signo != (uint32_t) sig) { + printf("FAIL: record %d ssi_signo=%u, expected %d\n", i, + buf[i].ssi_signo, sig); + fails++; + return; + } + if (buf[i].ssi_int != payloads[i]) { + printf("FAIL: record %d ssi_int=0x%x, expected 0x%x\n", i, + buf[i].ssi_int, payloads[i]); + fails++; + return; + } + } + PASS(); +} + +static void test_standard_coalesces(void) +{ + TEST("standard signals coalesce"); + + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGUSR1); + sigprocmask(SIG_BLOCK, &mask, 
NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + /* Three kill()s should produce exactly one signalfd record (Linux + * coalesces standard signals on the pending bitmask). + */ + kill(getpid(), SIGUSR1); + kill(getpid(), SIGUSR1); + kill(getpid(), SIGUSR1); + + struct signalfd_siginfo buf[4]; + memset(buf, 0, sizeof(buf)); + ssize_t r = read(fd, buf, sizeof(buf)); + if (r != (ssize_t) sizeof(buf[0])) { + printf("FAIL: expected one record (%zu bytes), got %zd\n", + sizeof(buf[0]), r); + close(fd); + fails++; + return; + } + if (buf[0].ssi_signo != (uint32_t) SIGUSR1) { + printf("FAIL: ssi_signo=%u\n", buf[0].ssi_signo); + close(fd); + fails++; + return; + } + /* Second read drains nothing -- pending bit cleared. */ + errno = 0; + ssize_t r2 = read(fd, buf, sizeof(buf)); + close(fd); + if (r2 != -1 || errno != EAGAIN) { + FAIL("expected EAGAIN on follow-up read"); + return; + } + PASS(); +} + +static void test_sigrtmax_reachable(void) +{ + /* SIGRTMAX (64 on aarch64) was excluded by an off-by-one in the + * collect/take loops (signum < LINUX_NSIG instead of <= LINUX_NSIG). + * This test fails before the fix and passes after. 
+ */ + TEST("SIGRTMAX reaches signalfd"); + + int sig = SIGRTMAX; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + pid_t pid = getpid(); + unsigned char info[128]; + build_kernel_siginfo(sig, -1, pid, getuid(), 0xCAFEBABE, NULL, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + close(fd); + FAIL("rt_tgsigqueueinfo SIGRTMAX"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + printf("FAIL: read returned %zd\n", r); + fails++; + return; + } + if (rec.ssi_signo != (uint32_t) sig || + rec.ssi_int != (int32_t) 0xCAFEBABE) { + printf("FAIL: signo=%u int=0x%x\n", rec.ssi_signo, rec.ssi_int); + fails++; + return; + } + PASS(); +} + +static void test_ssi_ptr_roundtrip(void) +{ + /* sigval has separate int and ptr forms. For the ptr form the full 64 + * bits land in si_value; signalfd_siginfo exposes both ssi_int (low 32) + * and ssi_ptr (full 64). Verify both are populated from one queued ptr. + */ + TEST("ssi_ptr / ssi_int round-trip"); + + int sig = SIGRTMIN + 2; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + /* Use an arbitrary pointer-shaped value with a high bit set so a + * truncating implementation drops information detectably. 
+ */ + void *payload = (void *) 0x0123456789ABCDEFULL; + pid_t pid = getpid(); + unsigned char info[128]; + build_kernel_siginfo(sig, -1, pid, getuid(), 0, payload, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + close(fd); + FAIL("rt_tgsigqueueinfo"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + FAIL("read short"); + return; + } + if (rec.ssi_ptr != (uint64_t) (uintptr_t) payload) { + printf("FAIL: ssi_ptr=0x%llx, expected 0x%llx\n", + (unsigned long long) rec.ssi_ptr, + (unsigned long long) (uintptr_t) payload); + fails++; + return; + } + /* ssi_int aliases the low 32 bits of the same union. */ + if (rec.ssi_int != (int32_t) (uintptr_t) payload) { + printf("FAIL: ssi_int=0x%x\n", rec.ssi_int); + fails++; + return; + } + PASS(); +} + +static void test_sender_metadata(void) +{ + /* Verify ssi_pid / ssi_uid carry the sender values supplied via + * rt_tgsigqueueinfo's siginfo (Linux-style SI_QUEUE: caller fills + * si_pid/si_uid; kernel does not override for negative si_code). 
+ */ + TEST("ssi_pid / ssi_uid from sender"); + + int sig = SIGRTMIN + 3; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + pid_t pid = getpid(); + uid_t uid = getuid(); + unsigned char info[128]; + build_kernel_siginfo(sig, -1, pid, uid, 0x55AA, NULL, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + close(fd); + FAIL("rt_tgsigqueueinfo"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + FAIL("read short"); + return; + } + if (rec.ssi_pid != (uint32_t) pid || rec.ssi_uid != uid) { + printf("FAIL: ssi_pid=%u (want %d), ssi_uid=%u (want %u)\n", + rec.ssi_pid, pid, rec.ssi_uid, uid); + fails++; + return; + } + if (rec.ssi_code != -1) { + printf("FAIL: ssi_code=%d (want -1 SI_QUEUE)\n", rec.ssi_code); + fails++; + return; + } + PASS(); +} + +static void test_mask_filters_only(void) +{ + /* signalfd's own mask is the sole filter: a signal blocked from + * synchronous delivery via sigprocmask is still readable from the + * signalfd if its mask includes the signal. + */ + TEST("signalfd mask filters, not pthread mask"); + + sigset_t pblock; + sigemptyset(&pblock); + sigaddset(&pblock, SIGUSR1); + sigaddset(&pblock, SIGUSR2); + sigprocmask(SIG_BLOCK, &pblock, NULL); + + /* signalfd only watches SIGUSR1. SIGUSR2 stays pending in the process + * pending set after kill(), but must not appear in the read result. 
+ */ + sigset_t fdmask; + sigemptyset(&fdmask); + sigaddset(&fdmask, SIGUSR1); + + int fd = signalfd(-1, &fdmask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + kill(getpid(), SIGUSR2); + kill(getpid(), SIGUSR1); + + struct signalfd_siginfo rec[4]; + memset(rec, 0, sizeof(rec)); + ssize_t r = read(fd, rec, sizeof(rec)); + if (r != (ssize_t) sizeof(rec[0])) { + printf("FAIL: expected one record, got %zd\n", r); + close(fd); + fails++; + /* Drain SIGUSR2 to keep state clean for later tests. */ + sigset_t draino; + sigemptyset(&draino); + sigaddset(&draino, SIGUSR2); + int tmp = signalfd(-1, &draino, SFD_NONBLOCK); + if (tmp >= 0) { + (void) read(tmp, rec, sizeof(rec)); + close(tmp); + } + return; + } + if (rec[0].ssi_signo != (uint32_t) SIGUSR1) { + printf("FAIL: got signo=%u, expected SIGUSR1\n", rec[0].ssi_signo); + close(fd); + fails++; + return; + } + close(fd); + + /* SIGUSR2 must still be pending -- prove by widening mask and reading. */ + sigaddset(&fdmask, SIGUSR2); + int fd2 = signalfd(-1, &fdmask, SFD_NONBLOCK); + if (fd2 < 0) { + FAIL("signalfd 2"); + return; + } + memset(rec, 0, sizeof(rec)); + r = read(fd2, rec, sizeof(rec)); + close(fd2); + if (r != (ssize_t) sizeof(rec[0]) || + rec[0].ssi_signo != (uint32_t) SIGUSR2) { + printf("FAIL: SIGUSR2 not pending after first read (r=%zd)\n", r); + fails++; + return; + } + PASS(); +} + +static void test_sigqueue_libc_path(void) +{ + /* glibc / musl sigqueue() goes through SYS_rt_sigqueueinfo (138). + * Without that wired in, sigqueue() returns ENOSYS and apps that rely + * on POSIX queued signals (real-time apps, gdb) break. Verify the + * libc path produces a payload-bearing record. 
+ */ + TEST("libc sigqueue() round-trip"); + + int sig = SIGRTMIN + 4; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + union sigval sv; + sv.sival_int = 0x4242; + if (sigqueue(getpid(), sig, sv) != 0) { + close(fd); + FAIL("sigqueue"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + FAIL("read short"); + return; + } + if (rec.ssi_signo != (uint32_t) sig || rec.ssi_int != 0x4242) { + printf("FAIL: signo=%u int=0x%x\n", rec.ssi_signo, rec.ssi_int); + fails++; + return; + } + PASS(); +} + +static void test_sigqueue_standard_metadata(void) +{ + TEST("standard sigqueue() keeps metadata"); + + int sig = SIGUSR1; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + union sigval sv; + sv.sival_int = 0x5151; + if (sigqueue(getpid(), sig, sv) != 0) { + close(fd); + FAIL("sigqueue std"); + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t r = read(fd, &rec, sizeof(rec)); + close(fd); + if (r != (ssize_t) sizeof(rec)) { + FAIL("read short"); + return; + } + if (rec.ssi_signo != (uint32_t) sig || rec.ssi_int != 0x5151 || + rec.ssi_code != SI_QUEUE || rec.ssi_pid != (uint32_t) getpid() || + rec.ssi_uid != (uint32_t) getuid()) { + printf("FAIL: signo=%u int=0x%x code=%d pid=%u uid=%u\n", rec.ssi_signo, + rec.ssi_int, rec.ssi_code, rec.ssi_pid, rec.ssi_uid); + fails++; + return; + } + PASS(); +} + +static void test_partial_fault_returns_partial_bytes(void) +{ + /* Partial-fault recovery (write-then-take semantics). + * + * Queue four RT signals (payloads 0xA1..0xA4). 
Place a 4-record buffer + * so records 0 and 1 land in a valid page but records 2 and 3 cross + * into an unmapped page. The bridge writes 2 records, hits EFAULT + * trying to write record 2, returns partial bytes (2 * 128) -- and + * crucially does NOT take records 2 and 3 from the rt-queue, so they + * remain pending in original FIFO order. The follow-up read returns + * exactly two records with payloads 0xA3 then 0xA4 (no duplication + * of 0xA1 / 0xA2; no re-queue path that could overflow RT_SIGQUEUE_MAX + * or desync the notification pipe). + */ + TEST("partial fault: partial bytes + FIFO"); + + int sig = SIGRTMIN + 5; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + long page = sysconf(_SC_PAGESIZE); + if (page <= 0) + page = 4096; + void *region = mmap(NULL, page * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (region == MAP_FAILED) { + close(fd); + FAIL("mmap guard region"); + return; + } + if (munmap((char *) region + page, page) != 0) { + munmap(region, page); + close(fd); + FAIL("munmap guard"); + return; + } + + pid_t pid = getpid(); + const int payloads[] = {0xA1, 0xA2, 0xA3, 0xA4}; + const int N = 4; + for (int i = 0; i < N; i++) { + unsigned char info[128]; + build_kernel_siginfo(sig, -1, pid, getuid(), payloads[i], NULL, info); + if (raw_rt_tgsigqueueinfo(pid, pid, sig, info) != 0) { + munmap(region, page); + close(fd); + FAIL("rt_tgsigqueueinfo"); + return; + } + } + + char *buf = (char *) region + page - (2 * 128); + errno = 0; + ssize_t r = read(fd, buf, 4 * sizeof(struct signalfd_siginfo)); + if (r != (ssize_t) (2 * sizeof(struct signalfd_siginfo))) { + printf("FAIL: expected 256 partial bytes, got r=%zd errno=%d\n", r, + errno); + munmap(region, page); + close(fd); + fails++; + return; + } + + struct signalfd_siginfo *delivered = (struct signalfd_siginfo *) buf; + if 
(delivered[0].ssi_signo != (uint32_t) sig || + delivered[0].ssi_int != payloads[0] || + delivered[1].ssi_signo != (uint32_t) sig || + delivered[1].ssi_int != payloads[1]) { + munmap(region, page); + close(fd); + printf("FAIL: page 1 records not [0x%x,0x%x]: got [0x%x,0x%x]\n", + payloads[0], payloads[1], delivered[0].ssi_int, + delivered[1].ssi_int); + fails++; + return; + } + munmap(region, page); + + /* Follow-up read into a fully-valid buffer. + * + * Linux dequeues the record being copied before checking copy_to_user, + * so the record that hit EFAULT (payloads[2]) is lost; a follow-up + * read returns one record (payloads[3]). elfuse defers the take until + * the write succeeds, so a follow-up read returns two records + * (payloads[2] then payloads[3]) in original FIFO order. + * + * Both behaviors are accepted: the contract under test is "no + * duplication of records that already reached the guest, no + * out-of-order delivery within whatever survives, and the last + * queued payload is always preserved." 
+ */ + struct signalfd_siginfo recs[8]; + memset(recs, 0, sizeof(recs)); + ssize_t r2 = read(fd, recs, sizeof(recs)); + close(fd); + size_t recs_returned = (size_t) r2 / sizeof(recs[0]); + bool linux_loose = + (r2 == (ssize_t) sizeof(recs[0])) && recs[0].ssi_int == payloads[3]; + bool elfuse_strict = (r2 == (ssize_t) (2 * sizeof(recs[0]))) && + recs[0].ssi_int == payloads[2] && + recs[1].ssi_int == payloads[3]; + if (!linux_loose && !elfuse_strict) { + printf( + "FAIL: follow-up read returned %zd bytes (%zu records); " + "first=0x%x second=0x%x; expected either [0x%x] or [0x%x,0x%x]\n", + r2, recs_returned, recs[0].ssi_int, recs[1].ssi_int, payloads[3], + payloads[2], payloads[3]); + fails++; + return; + } + PASS(); +} + +static void test_rt_sigqueueinfo_bad_pointer_efault(void) +{ + TEST("rt_sigqueueinfo unreadable siginfo faults"); + + int sig = SIGRTMIN + 6; + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, NULL); + + int fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + FAIL("signalfd"); + return; + } + + errno = 0; + int ret = raw_rt_sigqueueinfo(getpid(), sig, (const void *) 1); + if (ret != -1 || errno != EFAULT) { + printf("FAIL: rt_sigqueueinfo unreadable info ret=%d errno=%d\n", ret, + errno); + close(fd); + fails++; + return; + } + + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + errno = 0; + ssize_t r = read(fd, &rec, sizeof(rec)); + if (r != -1 || errno != EAGAIN) { + printf("FAIL: bad rt_sigqueueinfo queued a signal r=%d errno=%d\n", + (int) r, errno); + close(fd); + fails++; + return; + } + + close(fd); + PASS(); +} + +static void test_rt_sigqueueinfo_rejects_foreign_pid(void) +{ + /* rt_sigqueueinfo is a process-scoped (tgid) syscall. A pid that does + * not name the current process must return ESRCH instead of routing + * the signal through whichever thread happened to share the numeric + * id. 
The probe uses a pid the host kernel cannot have assigned
+ * to the current guest, so the call cannot collide with a
+ * legitimate target.
+ */ + sigset_t block; + sigemptyset(&block); + sigaddset(&block, SIGRTMIN); + sigprocmask(SIG_BLOCK, &block, NULL); + + worker_sync_t s; + pthread_mutex_init(&s.mtx, NULL); + pthread_cond_init(&s.ready_cv, NULL); + pthread_cond_init(&s.go_cv, NULL); + s.worker_tid = -1; + s.ready = false; + s.go = false; + + pthread_t th; + if (pthread_create(&th, NULL, tid_worker, &s) != 0) { + FAIL("pthread_create"); + return; + } + + pthread_mutex_lock(&s.mtx); + while (!s.ready) + pthread_cond_wait(&s.ready_cv, &s.mtx); + pid_t worker_tid = s.worker_tid; + pthread_mutex_unlock(&s.mtx); + + if (worker_tid == getpid()) { + pthread_mutex_lock(&s.mtx); + s.go = true; + pthread_cond_signal(&s.go_cv); + pthread_mutex_unlock(&s.mtx); + pthread_join(th, NULL); + FAIL("worker tid equals process pid"); + return; + } + + int sfd_fd = signalfd(-1, &block, SFD_NONBLOCK); + if (sfd_fd < 0) { + pthread_mutex_lock(&s.mtx); + s.go = true; + pthread_cond_signal(&s.go_cv); + pthread_mutex_unlock(&s.mtx); + pthread_join(th, NULL); + FAIL("signalfd"); + return; + } + + unsigned char info[128]; + build_kernel_siginfo(SIGRTMIN, -1, getpid(), getuid(), 0xBEEF, NULL, info); + + errno = 0; + int ret = raw_rt_sigqueueinfo(worker_tid, SIGRTMIN, info); + int err = errno; + + /* Drain any queued signal via signalfd before letting the worker + * exit so the signal does not leak into pthread_join. 
+ */ + struct signalfd_siginfo rec; + memset(&rec, 0, sizeof(rec)); + ssize_t got = -1; + int got_err = 0; + if (ret == 0) { + errno = 0; + got = read(sfd_fd, &rec, sizeof(rec)); + got_err = errno; + } + close(sfd_fd); + + pthread_mutex_lock(&s.mtx); + s.go = true; + pthread_cond_signal(&s.go_cv); + pthread_mutex_unlock(&s.mtx); + pthread_join(th, NULL); + pthread_mutex_destroy(&s.mtx); + pthread_cond_destroy(&s.ready_cv); + pthread_cond_destroy(&s.go_cv); + + if (ret != 0) { + printf("FAIL: worker tid %d: ret=%d errno=%d (expected 0)\n", + (int) worker_tid, ret, err); + fails++; + return; + } + if (got != (ssize_t) sizeof(rec) || rec.ssi_signo != (uint32_t) SIGRTMIN || + rec.ssi_int != 0xBEEF) { + printf("FAIL: signalfd read got=%zd errno=%d signo=%u int=0x%x\n", got, + got_err, rec.ssi_signo, rec.ssi_int); + fails++; + return; + } + PASS(); +} + +int main(void) +{ + printf("test-signalfd-hardening: signalfd read semantics audit\n"); + + test_rt_multiplicity(); + test_standard_coalesces(); + test_sigrtmax_reachable(); + test_ssi_ptr_roundtrip(); + test_sender_metadata(); + test_mask_filters_only(); + test_sigqueue_libc_path(); + test_sigqueue_standard_metadata(); + test_partial_fault_returns_partial_bytes(); + test_rt_sigqueueinfo_bad_pointer_efault(); + test_rt_sigqueueinfo_rejects_foreign_pid(); + test_rt_sigqueueinfo_thread_tid_routes_to_tgid(); + + SUMMARY("test-signalfd-hardening"); + return fails > 0 ? 1 : 0; +}