Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@ $(BUILD_DIR)/test-pthread: tests/test-pthread.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

# test-signalfd-hardening needs -lpthread for the worker-thread tid
# regression case in test_rt_sigqueueinfo_rejects_thread_tid.
$(BUILD_DIR)/test-signalfd-hardening: tests/test-signalfd-hardening.c | $(BUILD_DIR)
@echo " CROSS $< (with -lpthread)"
$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread

endif

include mk/tests.mk
Expand Down
1 change: 1 addition & 0 deletions src/syscall/abi.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
#define SYS_rt_sigaction 134
#define SYS_rt_sigprocmask 135
#define SYS_rt_sigpending 136
#define SYS_rt_sigqueueinfo 138
#define SYS_rt_sigreturn 139
#define SYS_setpriority 140
#define SYS_getpriority 141
Expand Down
1 change: 1 addition & 0 deletions src/syscall/dispatch.tbl
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ SYS_rt_sigaction sc_rt_sigaction 1
SYS_rt_sigprocmask sc_rt_sigprocmask 1
SYS_rt_sigpending sc_rt_sigpending 0
SYS_rt_sigreturn sc_rt_sigreturn 1
SYS_rt_sigqueueinfo sc_rt_sigqueueinfo 1
SYS_rt_tgsigqueueinfo sc_rt_tgsigqueueinfo 1

# Time and timers
Expand Down
71 changes: 60 additions & 11 deletions src/syscall/fd.c
Original file line number Diff line number Diff line change
Expand Up @@ -885,15 +885,31 @@ int64_t sys_signalfd4(guest_t *g,
return gfd;
}

/* Read from signalfd: consume pending signals matching the mask.
* Each signal produces one signalfd_siginfo (128 bytes).
* Returns number of bytes read, or -EAGAIN if nothing pending.
/* Read from signalfd: consume pending signals matching the signalfd's mask.
*
* Each signal produces one signalfd_siginfo (128 bytes). RT signals (32-64)
* are queued: each sigqueue/rt_tgsigqueueinfo enqueues a distinct instance with
* its own si_int/si_ptr payload, and signalfd_read returns them in FIFO order
* without coalescing (Linux behavior).
*
* Per-thread signal mask is intentionally not consulted: signalfd is the
* standard mechanism for reading signals that were blocked from synchronous
* delivery via sigprocmask(). The signalfd's own mask (set at create time or
* via signalfd(fd, &mask, ...)) is the only filter applied.
*
* ssi_int/ssi_ptr are populated from queued metadata when present.
* Standard signals (1-31) still coalesce to one pending instance, but Linux
* preserves one siginfo payload for that instance.
*
* Returns the number of bytes read (multiple of sizeof(signalfd_siginfo)), or
* -EAGAIN if nothing pending and the fd is non-blocking.
*/
int64_t signalfd_read(int guest_fd,
guest_t *g,
uint64_t buf_gva,
uint64_t count)
{
retry:
/* Capture slot state under sfd_lock, then release BEFORE calling
* signal_get_state() which acquires sig_lock(4). Holding sfd_lock(5a)
* while taking sig_lock(4) would violate lock ordering.
Expand Down Expand Up @@ -963,10 +979,21 @@ int64_t signalfd_read(int guest_fd,
if (deliverable == 0)
goto no_pending;
}
total = signal_peek_signalfd(mask, pending, max_signals);
if (total == 0)
size_t peeked = signal_peek_signalfd(mask, pending, max_signals);
if (peeked == 0)
goto no_pending;
for (size_t i = 0; i < total; i++) {

/* Write-then-take. Writing first means that on a guest_write_small EFAULT
* the rt-queue is still intact and signals are not lost: no re-queue dance,
* no RT_SIGQUEUE_MAX overflow window, no extra signalfd_notify writes that
* would desync the pipe-byte count from the actual pending-signal count.
* Take only the prefix the writer landed; if a concurrent consumer advanced
* the rt-queue head between peek and take, take returns less than the
* written count and the bridge restarts the read loop via the retry label
* below.
*/
size_t written = 0;
for (size_t i = 0; i < peeked; i++) {
linux_signalfd_siginfo_t info;
memset(&info, 0, sizeof(info));
info.ssi_signo = (uint32_t) pending[i].signum;
Expand All @@ -978,12 +1005,34 @@ int64_t signalfd_read(int guest_fd,

uint64_t off = i * sizeof(linux_signalfd_siginfo_t);
if (guest_write_small(g, buf_gva + off, &info, sizeof(info)) < 0) {
if (pending != pending_stack)
free(pending);
return -LINUX_EFAULT;
if (written == 0) {
/* No bytes transferred: surface EFAULT, leave the queue
* untouched so the signal is not lost. Matches the elfuse
* promise locked in by tests/test-tier-b's
* test_signalfd_efault_preserves_pending.
*/
if (pending != pending_stack)
free(pending);
return -LINUX_EFAULT;
}

/* Partial success: stop writing and let take consume only the
* delivered prefix. The unwritten entries stay in the rt-queue
* naturally because the take call has not run yet.
*/
break;
}
written++;
}

total = signal_take_signalfd_exact(pending, written);
if (total == 0) {
if (written == 0)
goto no_pending;
if (pending != pending_stack)
free(pending);
goto retry;
}
total = signal_take_signalfd_exact(pending, total);

/* Drain pipe: consume exactly one byte per signal read. If the code drains
* ALL bytes, the code would lose notifications for signals that arrived
Expand All @@ -998,7 +1047,7 @@ int64_t signalfd_read(int guest_fd,

if (pending != pending_stack)
free(pending);
return (int64_t) (total * sizeof(linux_signalfd_siginfo_t));
return (int64_t) total * (int64_t) sizeof(linux_signalfd_siginfo_t);

no_pending:
if (pending != pending_stack)
Expand Down
107 changes: 75 additions & 32 deletions src/syscall/signal.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
* Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
* SPDX-License-Identifier: Apache-2.0
*
* Implements Linux-compatible signal delivery for aarch64 guests. When a
* signal is queued (e.g., SIGPIPE from write() to broken pipe), signal
* emulation builds an rt_sigframe on the guest stack matching the kernel's
* setup_rt_frame() layout, then redirects the vCPU to the guest's signal
* handler. The guest handler eventually calls rt_sigreturn (SYS 139), which
* restores the saved register state from the frame.
* Implements Linux-compatible signal delivery for aarch64 guests. When a signal
* is queued (e.g., SIGPIPE from write() to broken pipe), signal emulation
* builds an rt_sigframe on the guest stack matching the kernel's setup_rt_frame
* layout, then redirects the vCPU to the guest's signal handler. The guest
* handler eventually calls rt_sigreturn (SYS 139), which restores the saved
* register state from the frame.
*
* Reference: Linux arch/arm64/kernel/signal.c
*/
Expand Down Expand Up @@ -161,17 +161,43 @@ static inline int sig_uncatchable(int signum)
return signum == LINUX_SIGKILL || signum == LINUX_SIGSTOP;
}

static void signal_rt_enqueue_locked(int signum, const signal_rt_info_t *info)
static signal_rt_info_t signal_default_info(int signum)
{
int idx = signum - LINUX_SIGRTMIN;
signal_rt_info_t fallback = {
return (signal_rt_info_t) {
.signum = signum,
.si_code = LINUX_SI_USER,
.si_pid = (int32_t) proc_get_pid(),
.si_uid = proc_get_uid(),
.si_int = 0,
.si_ptr = 0,
};
}

static void signal_standard_enqueue_locked(int signum,
const signal_rt_info_t *info)
{
int idx = signum - 1;
uint64_t bit = sig_bit(signum);

if (!(sig_state.pending & bit)) {
sig_state.std_info[idx] = info ? *info : signal_default_info(signum);
sig_state.std_info_valid[idx] = info != NULL;
}
sig_state.pending |= bit;
}

/* Return the siginfo payload for a pending standard signal (1-31).
 *
 * Yields the payload captured at enqueue time when one exists; otherwise
 * synthesizes default SI_USER metadata. Does not consume or modify any
 * pending state. Caller must hold sig_lock.
 */
static signal_rt_info_t signal_standard_peek_locked(int signum)
{
    const int slot = signum - 1;

    return sig_state.std_info_valid[slot] ? sig_state.std_info[slot]
                                          : signal_default_info(signum);
}

static void signal_rt_enqueue_locked(int signum, const signal_rt_info_t *info)
{
int idx = signum - LINUX_SIGRTMIN;
signal_rt_info_t fallback = signal_default_info(signum);
const signal_rt_info_t *entry = info ? info : &fallback;

sig_state.pending |= sig_bit(signum);
Expand Down Expand Up @@ -279,9 +305,10 @@ void signal_queue(int signum)
if (signum < 1 || signum > LINUX_NSIG)
return;
pthread_mutex_lock(&sig_lock);
sig_state.pending |= sig_bit(signum);
if (signum >= LINUX_SIGRTMIN)
signal_rt_enqueue_locked(signum, NULL);
else
signal_standard_enqueue_locked(signum, NULL);
/* Publish hint before releasing lock so vCPU hot path sees it. */
atomic_store_explicit(&sig_pending_hint, sig_state.pending,
memory_order_release);
Expand Down Expand Up @@ -317,7 +344,17 @@ void signal_queue_rt(int signum,
int32_t si_int,
uint64_t si_ptr)
{
if (signum < LINUX_SIGRTMIN || signum > LINUX_NSIG)
signal_queue_info(signum, si_code, si_pid, si_uid, si_int, si_ptr);
}

void signal_queue_info(int signum,
int32_t si_code,
int32_t si_pid,
uint32_t si_uid,
int32_t si_int,
uint64_t si_ptr)
{
if (signum < 1 || signum > LINUX_NSIG)
return;
pthread_mutex_lock(&sig_lock);
signal_rt_info_t info = {
Expand All @@ -328,7 +365,10 @@ void signal_queue_rt(int signum,
.si_int = si_int,
.si_ptr = si_ptr,
};
signal_rt_enqueue_locked(signum, &info);
if (signum >= LINUX_SIGRTMIN)
signal_rt_enqueue_locked(signum, &info);
else
signal_standard_enqueue_locked(signum, &info);
atomic_store_explicit(&sig_pending_hint, sig_state.pending,
memory_order_release);
pthread_mutex_unlock(&sig_lock);
Expand Down Expand Up @@ -416,7 +456,12 @@ static size_t signal_collect_signalfd(uint64_t mask,

pthread_mutex_lock(&sig_lock);
uint64_t deliverable = sig_state.pending & mask;
for (int signum = 1; signum < LINUX_NSIG && total < max; signum++) {
/* signum runs 1..LINUX_NSIG inclusive (64 is the highest valid RT signal
* on aarch64 Linux). Bare-musl applications can target SIGRTMAX directly,
* so the inclusive bound matters even though glibc reserves the top of the
* RT range for itself.
*/
for (int signum = 1; signum <= LINUX_NSIG && total < max; signum++) {
uint64_t bit = BIT64(signum - 1);
if (!(deliverable & bit))
continue;
Expand Down Expand Up @@ -446,14 +491,9 @@ static size_t signal_collect_signalfd(uint64_t mask,
total++;
}
} else {
signal_rt_info_t info = {
.signum = signum,
.si_code = LINUX_SI_USER,
.si_pid = (int32_t) proc_get_pid(),
.si_uid = proc_get_uid(),
.si_int = 0,
.si_ptr = 0,
};
signal_rt_info_t info = signal_standard_peek_locked(signum);
if (consume)
sig_state.std_info_valid[signum - 1] = false;
if (consume)
sig_state.pending &= ~bit;
if (out)
Expand Down Expand Up @@ -482,7 +522,7 @@ size_t signal_take_signalfd_exact(const signal_rt_info_t *expected, size_t max)
pthread_mutex_lock(&sig_lock);
for (; total < max; total++) {
int signum = expected[total].signum;
if (signum <= 0 || signum >= LINUX_NSIG)
if (signum <= 0 || signum > LINUX_NSIG)
break;

uint64_t bit = sig_bit(signum);
Expand All @@ -508,6 +548,15 @@ size_t signal_take_signalfd_exact(const signal_rt_info_t *expected, size_t max)
continue;
}

signal_rt_info_t current = signal_standard_peek_locked(signum);
const signal_rt_info_t *want = &expected[total];
if (current.signum != want->signum ||
current.si_code != want->si_code ||
current.si_pid != want->si_pid || current.si_uid != want->si_uid ||
current.si_int != want->si_int || current.si_ptr != want->si_ptr)
break;

sig_state.std_info_valid[signum - 1] = false;
sig_state.pending &= ~bit;
}
atomic_store_explicit(&sig_pending_hint, sig_state.pending,
Expand Down Expand Up @@ -1107,14 +1156,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)

/* Find lowest pending unblocked signal */
int signum = bit_ctz64(deliverable) + 1;
signal_rt_info_t rt_info = {
.signum = signum,
.si_code = LINUX_SI_USER,
.si_pid = (int32_t) proc_get_pid(),
.si_uid = proc_get_uid(),
.si_int = 0,
.si_ptr = 0,
};
signal_rt_info_t rt_info = signal_default_info(signum);

/* Dequeue: for RT signals, decrement count and only clear the
* pending bit when the queue is empty. Standard signals are
Expand All @@ -1123,6 +1165,8 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)
if (signum >= LINUX_SIGRTMIN) {
signal_rt_dequeue_locked(signum, &rt_info);
} else {
rt_info = signal_standard_peek_locked(signum);
sig_state.std_info_valid[signum - 1] = false;
sig_state.pending &= ~sig_bit(signum);
}

Expand Down Expand Up @@ -1210,8 +1254,7 @@ int signal_deliver(hv_vcpu_t vcpu, guest_t *g, int *exit_code)
frame.info.si_code = rt_info.si_code;
frame.info.si_pid = rt_info.si_pid;
frame.info.si_uid = (int32_t) rt_info.si_uid;
if (signum >= LINUX_SIGRTMIN)
frame.info.si_value = rt_info.si_ptr;
frame.info.si_value = rt_info.si_ptr;
}

/* ucontext: embed a per-delivery cookie in uc_flags for SROP
Expand Down
19 changes: 17 additions & 2 deletions src/syscall/signal.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,16 +184,21 @@ typedef struct {
bool saved_blocked_valid; /* True if saved_blocked is set */
linux_stack_t altstack; /* Alternate signal stack (sigaltstack) */
bool on_altstack; /* True if currently delivering on altstack */
/* Standard signal metadata: Linux coalesces signals 1-31, but preserves one
* siginfo payload for the pending instance.
*/
bool std_info_valid[LINUX_SIGRTMIN - 1];
signal_rt_info_t std_info[LINUX_SIGRTMIN - 1];
/* RT signal queue: count of pending instances per signal.
* Standard signals (1-31) use only the pending bitmask (coalesced).
* Standard signals (1-31) use the pending bitmask plus std_info[].
* RT signals (32-64) are queued: each instance is tracked separately.
*/
int rt_queue[RT_SIGNAL_COUNT];
uint8_t rt_head[RT_SIGNAL_COUNT];
signal_rt_info_t rt_info[RT_SIGNAL_COUNT][RT_SIGQUEUE_MAX];
} signal_state_t;

/* API. */
/* API */

/* Initialize signal state: all SIG_DFL, nothing pending/blocked. */
void signal_init(void);
Expand All @@ -215,6 +220,16 @@ void signal_queue_rt(int signum,
int32_t si_int,
uint64_t si_ptr);

/* Queue a signal with explicit siginfo metadata. Standard signals preserve
* one payload while coalesced; RT signals enqueue every instance.
*/
void signal_queue_info(int signum,
int32_t si_code,
int32_t si_pid,
uint32_t si_uid,
int32_t si_int,
uint64_t si_ptr);

/* Set fault info for the next signal delivery. When set, signal_deliver()
* populates si_code, si_addr, fault_address, and ESR context from these
* values instead of using the default SI_USER/si_pid fields. Consumed
Expand Down
Loading
Loading