From 1423b4a70b4dddd3387364d74160fdc9855e4585 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 5 May 2026 15:21:03 +0800
Subject: [PATCH] Honor futex_waitv clockid and wake without polling

futex_waitv (SYS 449) is SYSCALL_DEFINE5 in the kernel: (waiters, nr_futexes,
flags, timeout, clockid). The dispatch wrapper sc_futex_waitv was forwarding
only x0..x3, dropping x4. The implementation hardcoded a
CLOCK_MONOTONIC->CLOCK_REALTIME deadline conversion, so a guest asking for
CLOCK_REALTIME got monotonic semantics regardless of what it passed.

Wire x4 through and give sys_futex_waitv a clockid parameter. Validate
against CLOCK_REALTIME or CLOCK_MONOTONIC when timeout is non-NULL (Linux
returns EINVAL otherwise) and branch the absolute-deadline conversion on
clockid.

The pre-existing waitv loop slept on a private cond that no wake site ever
signalled; forward progress relied on a 50ms poll. Add optional group_lock /
group_cond pointers to futex_waiter_t plus a futex_waiter_notify_group
helper. Each wake site (futex_wake, futex_requeue, futex_wake_op both
passes, futex_unlock_pi) calls it after marking woken=1 under the bucket
lock. waitv blocks on shared.cond directly; the loop's bounded sleep is now
500ms, kept only so exit_group and timeout edges are still observed when no
signal arrives.

Lock ordering is bucket -> group_lock; futex_waiter_notify_group is only
called by wake sites that already hold the bucket lock, and the waitv thread
never holds shared.lock while taking a bucket lock. Stack lifetime of
shared.lock and shared.cond is protected by the bucket-lock pairing in
waitv_unlink, which synchronizes with every wake's notify_group before the
destroys run.

Locked in by tests/test-futex-waitv.c (19 cases): single-element wake
returning index 0, multi-element wake returning the woken index, EAGAIN on
stale val, eleven EINVAL paths covering nr=0, nr>128, top-level flags,
reserved!=0, element flags reserved bits, size!=U32, malformed
CLOCK_REALTIME / CLOCK_MONOTONIC nsec, unaligned uaddr, NULL waiters_gva,
and bad clockid with timeout; two EFAULT paths via PROT_NONE pages; the
inclusive nr_futexes==128 boundary; and ETIMEDOUT under both CLOCK_MONOTONIC
and CLOCK_REALTIME deadlines. The CLOCK_REALTIME case exists specifically to
catch a regression of the dropped-clockid bug.

Verified against Linux ground truth via tests/qemu-runner.sh; all 19 cases
match.
---
 Makefile                 |   6 +
 src/runtime/futex.c      | 263 +++++++++++++++++-------
 src/runtime/futex.h      |   5 +-
 src/syscall/syscall.c    |   3 +-
 tests/manifest.txt       |   3 +
 tests/test-futex-waitv.c | 418 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 623 insertions(+), 75 deletions(-)
 create mode 100644 tests/test-futex-waitv.c

diff --git a/Makefile b/Makefile
index 2895a93..8690e60 100644
--- a/Makefile
+++ b/Makefile
@@ -153,6 +153,12 @@ $(BUILD_DIR)/test-signalfd-hardening: tests/test-signalfd-hardening.c | $(BUILD_
 	@echo "  CROSS  $< (with -lpthread)"
 	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
 
+# test-futex-waitv needs -lpthread for the host wake-thread used to unblock
+# the main thread's futex_waitv.
+$(BUILD_DIR)/test-futex-waitv: tests/test-futex-waitv.c | $(BUILD_DIR)
+	@echo "  CROSS  $< (with -lpthread)"
+	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
+
 endif
 
 include mk/tests.mk
diff --git a/src/runtime/futex.c b/src/runtime/futex.c
index ca00358..0d84979 100644
--- a/src/runtime/futex.c
+++ b/src/runtime/futex.c
@@ -79,6 +79,10 @@ static _Atomic int futex_interrupt_requested = 0;
 
 /* Per-waiter node. 
Allocated on the host stack of the waiting thread (no malloc * needed; the waiter is stack-local to sys_futex). + * + * group_lock / group_cond are optional: when non-NULL, a wake additionally + * signals group_cond under group_lock. futex_waitv uses this so that any wake + * across the wait set unblocks the polling thread without per-bucket polling. */ typedef struct futex_waiter { uint64_t uaddr; /* Guest VA being waited on */ @@ -86,8 +90,23 @@ typedef struct futex_waiter { pthread_cond_t cond; /* Signalled by WAKE to unblock this waiter */ int woken; /* Set to 1 by WAKE before signalling */ struct futex_waiter *next; /* Next waiter in same bucket */ + pthread_mutex_t *group_lock; + pthread_cond_t *group_cond; } futex_waiter_t; +/* If the waiter belongs to a futex_waitv group, signal the group's cond so the + * polling thread wakes immediately. Caller holds the bucket lock; group_lock is + * acquired below it (lock order: bucket -> group_lock). + */ +static void futex_waiter_notify_group(futex_waiter_t *w) +{ + if (!w->group_cond) + return; + pthread_mutex_lock(w->group_lock); + pthread_cond_signal(w->group_cond); + pthread_mutex_unlock(w->group_lock); +} + /* One bucket in the hash table. Protected by its own mutex. * Lock order: 7 (leaf locks, index-ordered when two acquired). */ @@ -146,10 +165,29 @@ int futex_interrupt_pending(void) return atomic_load(&futex_interrupt_requested); } +/* Cap on guest-supplied tv_sec. The cap exists purely so the int64_t / time_t + * arithmetic in the deadline conversion (now.tv_sec + delta_sec, where + * delta_sec = lts.tv_sec - mono.tv_sec) cannot overflow even for adversarial + * inputs. INT64_MAX / 4 leaves four-way headroom for any pairwise sum or + * difference and still allows absolute CLOCK_REALTIME deadlines billions of + * years into the future, which comfortably covers the year-2038/2106 + * envelope. Linux saturates at KTIME_MAX (INT64_MAX ns ~ 292 years) on + * conversion to ktime_t; this code stays in struct timespec so it does not + * need that conversion, only the cap. + */ +#define FUTEX_TIMESPEC_SEC_MAX (INT64_MAX / 4) + +static int linux_timespec_is_valid(const linux_timespec_t *lts) +{ + return lts->tv_sec >= 0 && lts->tv_sec <= FUTEX_TIMESPEC_SEC_MAX && + lts->tv_nsec >= 0 && lts->tv_nsec < 1000000000L; +} + /* Convert a Linux guest timespec to an absolute struct timespec deadline. * For FUTEX_WAIT (relative timeout), adds the duration to the current time. * For FUTEX_WAIT_BITSET (absolute timeout), uses the value directly. - * Returns 0 on success, -1 if the guest pointer is invalid. + * Returns 0 on success, -1 if the guest pointer is invalid, -2 if the guest + * timespec is malformed. 
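+ *
+ * Illustrative caller pattern (a sketch only; the futex_wait and
+ * futex_lock_pi call sites below in this patch do exactly this mapping, so
+ * nothing here is new API):
+ *
+ *   int rc = futex_make_deadline(g, timeout_gva, is_absolute, &deadline);
+ *   if (rc == -1)
+ *       return -LINUX_EFAULT;   // guest pointer unreadable
+ *   if (rc == -2)
+ *       return -LINUX_EINVAL;   // tv_sec/tv_nsec out of range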
  */
 static int futex_make_deadline(guest_t *g,
                                uint64_t timeout_gva,
@@ -159,6 +197,8 @@ static int futex_make_deadline(guest_t *g,
     linux_timespec_t lts;
     if (guest_read_small(g, timeout_gva, &lts, sizeof(lts)) < 0)
         return -1;
+    if (!linux_timespec_is_valid(&lts))
+        return -2;
 
     if (is_absolute) {
         out->tv_sec = (time_t) lts.tv_sec;
@@ -194,8 +234,11 @@ static int64_t futex_wait(guest_t *g,
     bool has_timeout = (timeout_gva != 0);
     struct timespec deadline;
     if (has_timeout) {
-        if (futex_make_deadline(g, timeout_gva, is_absolute, &deadline) < 0)
+        int rc = futex_make_deadline(g, timeout_gva, is_absolute, &deadline);
+        if (rc == -1)
             return -LINUX_EFAULT;
+        if (rc == -2)
+            return -LINUX_EINVAL;
     }
 
     pthread_mutex_lock(&b->lock);
@@ -346,6 +389,7 @@ static int64_t futex_wake(uint64_t uaddr, uint32_t val, uint32_t bitset)
             *pp = w->next; /* Unlink before signaling */
             __atomic_store_n(&w->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w->cond);
+            futex_waiter_notify_group(w);
             woken++;
         } else {
             pp = &w->next;
@@ -428,6 +472,7 @@ static int64_t futex_requeue(guest_t *g,
             *pp = w->next;
             __atomic_store_n(&w->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w->cond);
+            futex_waiter_notify_group(w);
             woken++;
             /* Leave pp unchanged because *pp is already the next node */
         } else if ((uint32_t) requeued < requeue_count) {
@@ -562,6 +607,7 @@ static int64_t futex_wake_op(guest_t *g,
             *pp1 = w->next;
             __atomic_store_n(&w->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w->cond);
+            futex_waiter_notify_group(w);
             woken++;
         } else {
             pp1 = &w->next;
@@ -605,6 +651,7 @@ static int64_t futex_wake_op(guest_t *g,
             *pp2 = w2->next;
             __atomic_store_n(&w2->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w2->cond);
+            futex_waiter_notify_group(w2);
             woken2++;
         } else {
             pp2 = &w2->next;
@@ -663,9 +710,12 @@ static int64_t futex_lock_pi(guest_t *g, uint64_t uaddr, uint64_t timeout_gva)
     bool has_timeout = (timeout_gva != 0);
     struct timespec deadline;
     if (has_timeout) {
-        if (futex_make_deadline(g, timeout_gva, /*is_absolute=*/1, &deadline) <
-            0)
+        int rc =
+            futex_make_deadline(g, timeout_gva, /*is_absolute=*/1, &deadline);
+        if (rc == -1)
             return -LINUX_EFAULT;
+        if (rc == -2)
+            return -LINUX_EINVAL;
     }
 
     unsigned idx = futex_hash(uaddr);
@@ -901,6 +951,7 @@ static int64_t futex_unlock_pi(guest_t *g, uint64_t uaddr)
             *pp = w->next; /* Unlink before signaling */
             __atomic_store_n(&w->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w->cond);
+            futex_waiter_notify_group(w);
             break; /* Wake exactly one */
         }
         pp = &w->next;
@@ -974,13 +1025,45 @@ int futex_wake_one(guest_t *g, uint64_t uaddr)
     return (int) futex_wake(uaddr, 1, FUTEX_BITSET_MATCH_ANY);
 }
 
-/* Unlink a waiter from its bucket's list, taking the bucket lock. */
+/* Unlink a waiter from whichever bucket it currently sits in, with retry on
+ * concurrent requeue. The waiter's struct lives on the calling thread's stack;
+ * leaving a dangling reference behind is a real host-safety bug because a
+ * later wake at the new uaddr would dereference it. The regular futex_wait
+ * self-dequeue path handles the same race the same way.
+ *
+ * Termination: on each iteration we either find w in the bucket (unlink and
+ * return), or observe w->woken==1 under the bucket lock (the wake path
+ * unlinks before storing woken with RELEASE under the bucket lock; once we
+ * acquire that bucket lock we synchronize with it), or determine w was
+ * requeued elsewhere (re-hash and retry). 
Forward progress is guaranteed + * because every requeue and every wake also holds bucket locks, so once we + * take the lock for the bucket that hashes w's current uaddr, no concurrent + * mover can step around us. + */ static void waitv_unlink(futex_waiter_t *w) { - futex_bucket_t *b = &buckets[futex_hash(w->uaddr)]; - pthread_mutex_lock(&b->lock); - bucket_unlink_locked(b, w); - pthread_mutex_unlock(&b->lock); + if (__atomic_load_n(&w->woken, __ATOMIC_ACQUIRE)) + return; + for (;;) { + unsigned idx = futex_hash(w->uaddr); + futex_bucket_t *b = &buckets[idx]; + pthread_mutex_lock(&b->lock); + bool found = false; + for (futex_waiter_t **pp = &b->head; *pp; pp = &(*pp)->next) { + if (*pp == w) { + *pp = w->next; + found = true; + break; + } + } + bool was_woken = __atomic_load_n(&w->woken, __ATOMIC_ACQUIRE); + pthread_mutex_unlock(&b->lock); + if (found || was_woken) + return; + /* w must have been requeued to another bucket while we hashed. + * Re-read uaddr and try again. + */ + } } /* futex_waitv (SYS 449): batch futex wait on multiple addresses. @@ -1006,15 +1089,21 @@ typedef struct { _Static_assert(sizeof(linux_futex_waitv_t) == 24, "futex_waitv element must be 24 bytes"); -/* Shared poll state for futex_waitv. The mutex+cond pair serves only as a timed - * sleep primitive; futex_wake does not signal shared.cond directly. The poll - * loop checks waiter.woken flags periodically. +/* Shared wakeup state for futex_waitv. Each enqueued waiter holds pointers to + * this struct so any wake site (futex_wake, futex_requeue, futex_wake_op, + * futex_unlock_pi) signals shared.cond after marking the waiter woken. The + * polling loop sleeps on shared.cond with a bounded timeout so it still picks + * up exit_group requests and real timeouts even when no signal arrives. */ typedef struct { pthread_mutex_t lock; pthread_cond_t cond; } waitv_shared_t; +/* Linux clockid values accepted by futex_waitv. */ +#define LINUX_CLOCK_REALTIME 0 +#define LINUX_CLOCK_MONOTONIC 1 + static int waitv_collect_buckets(const linux_futex_waitv_t *elts, uint32_t nr_futexes, unsigned bucket_ids[FUTEX_WAITV_MAX], @@ -1047,13 +1136,64 @@ int64_t sys_futex_waitv(guest_t *g, uint64_t waiters_gva, uint32_t nr_futexes, uint32_t flags, - uint64_t timeout_gva) + uint64_t timeout_gva, + int clockid) { + /* Validation order matches Linux do_futex_waitv(): + * 1. flags + * 2. nr_futexes / !waiters + * 3. clockid (when timeout != NULL) + * 4. copy_from_user(timeout) -> EFAULT + * 5. timespec64_valid(timeout) -> EINVAL + * 6. copy_from_user(waiters) -> EFAULT + * 7. per-element validate -> EINVAL + * Reordering steps 4-7 to match Linux means a guest that passes a bad + * timeout AND bad waiters sees the same errno Linux would, instead of + * having ours fault on waiters first. + */ if (flags != 0) return -LINUX_EINVAL; - if (nr_futexes == 0 || nr_futexes > FUTEX_WAITV_MAX) + if (nr_futexes == 0 || nr_futexes > FUTEX_WAITV_MAX || waiters_gva == 0) return -LINUX_EINVAL; + bool has_timeout = (timeout_gva != 0); + if (has_timeout && clockid != LINUX_CLOCK_REALTIME && + clockid != LINUX_CLOCK_MONOTONIC) + return -LINUX_EINVAL; + + /* Copy and validate the timeout before reading the waiters array. 
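+     *
+     * Concrete consequence of keeping that order (guest-side sketch, not an
+     * additional test; bad_waiters_gva is just a stand-in for a pointer into
+     * an unmapped guest page):
+     *
+     *   struct timespec bad = { .tv_sec = 0, .tv_nsec = 1000000000 };
+     *   syscall(449, bad_waiters_gva, 1, 0, &bad, CLOCK_MONOTONIC);
+     *   // -> EINVAL from the timespec check (step 5); the EFAULT from the
+     *   //    waiters copy (step 6) is never reached, same as Linux.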
+     */
+    struct timespec deadline;
+    if (has_timeout) {
+        linux_timespec_t lts;
+        if (guest_read_small(g, timeout_gva, &lts, sizeof(lts)) < 0)
+            return -LINUX_EFAULT;
+        if (!linux_timespec_is_valid(&lts))
+            return -LINUX_EINVAL;
+
+        if (clockid == LINUX_CLOCK_MONOTONIC) {
+            /* Translate the monotonic absolute deadline to a CLOCK_REALTIME
+             * absolute deadline so pthread_cond_timedwait (which uses
+             * CLOCK_REALTIME) waits the right amount. macOS has no
+             * CLOCK_MONOTONIC condattr, so this conversion is unavoidable;
+             * minor wall-clock skew is accepted. lts.tv_sec is bounded by
+             * FUTEX_TIMESPEC_SEC_MAX (linux_timespec_is_valid), so the
+             * subtraction and addition stay inside int64_t / time_t range.
+             */
+            struct timeval now;
+            gettimeofday(&now, NULL);
+            struct timespec mono;
+            clock_gettime(CLOCK_MONOTONIC, &mono);
+            int64_t delta_sec = lts.tv_sec - mono.tv_sec;
+            long delta_nsec = (long) lts.tv_nsec - mono.tv_nsec;
+            deadline.tv_sec = now.tv_sec + delta_sec;
+            deadline.tv_nsec = (long) now.tv_usec * 1000 + delta_nsec;
+        } else {
+            deadline.tv_sec = (time_t) lts.tv_sec;
+            deadline.tv_nsec = (long) lts.tv_nsec;
+        }
+        timespec_normalize(&deadline);
+    }
+
     linux_futex_waitv_t elts[FUTEX_WAITV_MAX];
     size_t sz = nr_futexes * sizeof(linux_futex_waitv_t);
     if (guest_read_small(g, waiters_gva, elts, sz) < 0)
@@ -1066,26 +1206,12 @@ int64_t sys_futex_waitv(guest_t *g,
             return -LINUX_EINVAL;
         if ((elts[i].flags & FUTEX2_SIZE_MASK) != FUTEX2_SIZE_U32)
             return -LINUX_EINVAL;
-    }
-
-    /* Convert CLOCK_MONOTONIC absolute deadline to CLOCK_REALTIME for
-     * pthread_cond_timedwait (macOS has no CLOCK_MONOTONIC condattr).
-     */
-    bool has_timeout = (timeout_gva != 0);
-    struct timespec deadline;
-    if (has_timeout) {
-        linux_timespec_t lts;
-        if (guest_read_small(g, timeout_gva, &lts, sizeof(lts)) < 0)
-            return -LINUX_EFAULT;
-        struct timeval now;
-        gettimeofday(&now, NULL);
-        struct timespec mono;
-        clock_gettime(CLOCK_MONOTONIC, &mono);
-        int64_t delta_sec = lts.tv_sec - mono.tv_sec;
-        long delta_nsec = (long) lts.tv_nsec - mono.tv_nsec;
-        deadline.tv_sec = now.tv_sec + delta_sec;
-        deadline.tv_nsec = (long) now.tv_usec * 1000 + delta_nsec;
-        timespec_normalize(&deadline);
+        /* uaddr must be naturally aligned for the declared size. For
+         * FUTEX2_SIZE_U32 that is 4-byte alignment; an unaligned futex word
+         * loses atomicity on aarch64 and matches no kernel-side behavior.
+         */
+        if (elts[i].uaddr & 0x3)
+            return -LINUX_EINVAL;
     }
 
     waitv_shared_t shared;
@@ -1129,6 +1255,8 @@ int64_t sys_futex_waitv(guest_t *g,
         w->bitset = FUTEX_BITSET_MATCH_ANY;
         w->woken = 0;
         w->next = b->head;
+        w->group_lock = &shared.lock;
+        w->group_cond = &shared.cond;
         pthread_cond_init(&w->cond, NULL);
         b->head = w;
         enqueued++;
@@ -1137,16 +1265,14 @@ int64_t sys_futex_waitv(guest_t *g,
     for (int i = nbuckets - 1; i >= 0; i--)
         pthread_mutex_unlock(&bucket_ptrs[i]->lock);
 
-    /* All enqueued. Wait for any one to be woken. Poll periodically to check
-     * all waiters (the waker signals the waiter's own cond).
+    /* All enqueued. Block on shared.cond until any wake site signals it.
+     * The bounded sleep (capped at 500ms or the user deadline, whichever is
+     * sooner) gives proc_exit_group_requested() and timeout checks a chance to
+     * run if the cond_signal never arrives.
      */
     int result_idx = -1;
     pthread_mutex_lock(&shared.lock);
    for (;;) {
-        /* Check if any waiter was woken. Use acquire load to synchronize with
-         * the release store in futex_wake (which sets woken=1 under the bucket
-         * lock, but the polling thread reads outside that lock). 
- */ for (uint32_t i = 0; i < nr_futexes; i++) { if (__atomic_load_n(&waiters[i].woken, __ATOMIC_ACQUIRE)) { result_idx = (int) i; @@ -1161,53 +1287,46 @@ int64_t sys_futex_waitv(guest_t *g, break; } - /* Poll with 50ms timeout to check for wakeups across buckets. - * futex_wake sets waiter.woken=1 and signals waiter.cond, but waitv - * doesn't block on individual waiter conds; it polls all of them. This - * gives up to 50ms latency per wakeup. For lower latency, each - * waiter.cond could also broadcast to shared.cond, but that requires - * modifying the generic futex_wake path. - * Acceptable for now since futex_waitv is mainly for Wine/Proton which - * is not yet a target workload. - */ - struct timespec poll_ts; - timespec_deadline_in_ms(&poll_ts, 50); - + struct timespec wait_ts; + timespec_deadline_in_ms(&wait_ts, 500); if (has_timeout) { - /* Use earlier of poll_ts and deadline */ - if (deadline.tv_sec < poll_ts.tv_sec || - (deadline.tv_sec == poll_ts.tv_sec && - deadline.tv_nsec < poll_ts.tv_nsec)) { - poll_ts = deadline; + if (deadline.tv_sec < wait_ts.tv_sec || + (deadline.tv_sec == wait_ts.tv_sec && + deadline.tv_nsec < wait_ts.tv_nsec)) { + wait_ts = deadline; } } - pthread_cond_timedwait(&shared.cond, &shared.lock, &poll_ts); - - for (uint32_t i = 0; i < nr_futexes; i++) { - if (__atomic_load_n(&waiters[i].woken, __ATOMIC_ACQUIRE)) { - result_idx = (int) i; - break; - } - } - if (result_idx >= 0) - break; + pthread_cond_timedwait(&shared.cond, &shared.lock, &wait_ts); if (has_timeout) { struct timeval now; gettimeofday(&now, NULL); long now_ns = (long) now.tv_usec * 1000; - if (now.tv_sec > deadline.tv_sec || - (now.tv_sec == deadline.tv_sec && now_ns >= deadline.tv_nsec)) { - result_idx = -LINUX_ETIMEDOUT; + bool past_deadline = + now.tv_sec > deadline.tv_sec || + (now.tv_sec == deadline.tv_sec && now_ns >= deadline.tv_nsec); + if (past_deadline) { + /* Re-check woken under shared.lock before declaring a timeout: + * a wake that arrived during the cond_timedwait may not have + * been signalled yet on this thread but the woken flag is set. + */ + for (uint32_t i = 0; i < nr_futexes; i++) { + if (__atomic_load_n(&waiters[i].woken, __ATOMIC_ACQUIRE)) { + result_idx = (int) i; + break; + } + } + if (result_idx < 0) + result_idx = -LINUX_ETIMEDOUT; break; } } } pthread_mutex_unlock(&shared.lock); - /* Unlink all waiters (woken entries are usually already removed by - * futex_wake, but a second pass is harmless and avoids stale pointers). + /* Unlink all waiters (woken entries are already removed by the wake path, + * but a second pass is harmless and avoids stale pointers). */ for (uint32_t i = 0; i < nr_futexes; i++) waitv_unlink(&waiters[i]); diff --git a/src/runtime/futex.h b/src/runtime/futex.h index 430f885..79f60eb 100644 --- a/src/runtime/futex.h +++ b/src/runtime/futex.h @@ -51,13 +51,16 @@ int64_t sys_futex(guest_t *g, int futex_wake_one(guest_t *g, uint64_t uaddr); /* futex_waitv (SYS 449): batch wait on multiple futex addresses. + * clockid selects the timeout clock (Linux CLOCK_REALTIME=0 or + * CLOCK_MONOTONIC=1); ignored when timeout_gva==0. * Returns the index of the woken futex, or negative errno. */ int64_t sys_futex_waitv(guest_t *g, uint64_t waiters_gva, uint32_t nr_futexes, uint32_t flags, - uint64_t timeout_gva); + uint64_t timeout_gva, + int clockid); /* Walk the robust futex list on thread exit and set FUTEX_OWNER_DIED * on each held lock. 
Wakes one waiter per lock so a new owner can
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index 7d82918..11794eb 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -1445,10 +1445,9 @@ static int64_t sc_futex_waitv(guest_t *g,
                               uint64_t x5,
                               bool verbose)
 {
-    (void) x4;
     (void) x5;
     (void) verbose;
-    return sys_futex_waitv(g, x0, (uint32_t) x1, (uint32_t) x2, x3);
+    return sys_futex_waitv(g, x0, (uint32_t) x1, (uint32_t) x2, x3, (int) x4);
 }
 
 /* Generated dispatch table. */
diff --git a/tests/manifest.txt b/tests/manifest.txt
index 69be913..22b75f6 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -110,6 +110,9 @@ test-inotify
 [section] PI futex + EINTR regression tests
 test-futex-pi # diff=skip
 
+[section] futex_waitv (SYS 449) tests
+test-futex-waitv # diff=skip
+
 [section] SIGILL / null guard tests
 test-sigill
diff --git a/tests/test-futex-waitv.c b/tests/test-futex-waitv.c
new file mode 100644
index 0000000..3a007d9
--- /dev/null
+++ b/tests/test-futex-waitv.c
@@ -0,0 +1,418 @@
+/* futex_waitv (SYS 449) regression
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Tests:
+ *   1. Single waiter wakes when its futex is FUTEX_WAKEd; returns index 0.
+ *   2. Multi waiter: a wake on element[N] returns N, not 0 or another index.
+ *   3. EAGAIN when any element's val mismatches the in-memory word.
+ *   4. EINVAL on nr_futexes=0, nr_futexes>128, top-level flags!=0,
+ *      reserved!=0, element flags with reserved bits, element size !=
+ *      FUTEX2_SIZE_U32, malformed tv_nsec under either clock, and clockid
+ *      not in {CLOCK_REALTIME, CLOCK_MONOTONIC} with a non-NULL timeout.
+ *   5. ETIMEDOUT when no waker arrives before an absolute CLOCK_MONOTONIC
+ *      deadline.
+ *   6. ETIMEDOUT under CLOCK_REALTIME deadline (selected via the 5th syscall
+ *      argument). Verifies that the dispatch wrapper actually forwards x4.
+ *
+ * Syscalls exercised: futex_waitv(449), futex(98), nanosleep(101).
+ */
+
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "test-harness.h"
+#include "raw-syscall.h"
+
+#ifndef __NR_futex_waitv
+#define __NR_futex_waitv 449
+#endif
+
+#define FUTEX_WAITV_MAX 128
+#define FUTEX2_SIZE_U32 0x02
+#define FUTEX2_PRIVATE 0x80
+
+#ifndef CLOCK_REALTIME
+#define CLOCK_REALTIME 0
+#endif
+#ifndef CLOCK_MONOTONIC
+#define CLOCK_MONOTONIC 1
+#endif
+
+struct waitv_elem {
+    uint64_t val;
+    uint64_t uaddr;
+    uint32_t flags;
+    uint32_t __reserved;
+};
+
+int passes = 0, fails = 0;
+
+static long raw_futex_waitv(struct waitv_elem *waiters,
+                            unsigned int nr,
+                            unsigned int flags,
+                            struct timespec *timeout,
+                            int clockid)
+{
+    return raw_syscall5(__NR_futex_waitv, (long) waiters, (long) nr,
+                        (long) flags, (long) timeout, (long) clockid);
+}
+
+/* Helper: wake @addr after @sleep_ms milliseconds. Used to unblock the main
+ * thread's futex_waitv. The thread is joined by the test before the test
+ * returns, so the wake completes before any state on the test's stack goes
+ * out of scope. 
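+ *
+ * Typical use, mirroring test_single_wake() below (a sketch; the variable
+ * names are local to this illustration):
+ *
+ *   uint32_t f = 0;
+ *   struct waitv_elem elem = {
+ *       .val = 0,
+ *       .uaddr = (uint64_t) (uintptr_t) &f,
+ *       .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE,
+ *   };
+ *   struct waker_args wa = { .addr = &f, .sleep_ms = 50 };
+ *   pthread_t tid;
+ *   pthread_create(&tid, NULL, waker_thread, &wa);
+ *   long r = raw_futex_waitv(&elem, 1, 0, NULL, CLOCK_MONOTONIC);  // -> 0
+ *   pthread_join(tid, NULL);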
+ */ +struct waker_args { + uint32_t *addr; + long sleep_ms; +}; + +static void *waker_thread(void *arg) +{ + struct waker_args *a = (struct waker_args *) arg; + struct timespec ts = {.tv_sec = a->sleep_ms / 1000, + .tv_nsec = (a->sleep_ms % 1000) * 1000000L}; + nanosleep(&ts, NULL); + raw_futex_wake((int *) a->addr, 1); + return NULL; +} + +static void test_single_wake(void) +{ + TEST("single waiter wakes index 0"); + + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + pthread_t tid; + struct waker_args wa = {.addr = &f, .sleep_ms = 50}; + pthread_create(&tid, NULL, waker_thread, &wa); + + long r = raw_futex_waitv(&w, 1, 0, NULL, CLOCK_MONOTONIC); + pthread_join(tid, NULL); + + EXPECT_EQ(r, 0, "expected index 0"); +} + +static void test_multi_wake_index(void) +{ + TEST("multi waiter returns woken index"); + + uint32_t f[4] __attribute__((aligned(4))) = {0, 0, 0, 0}; + struct waitv_elem ws[4]; + for (int i = 0; i < 4; i++) { + ws[i].val = 0; + ws[i].uaddr = (uint64_t) (uintptr_t) &f[i]; + ws[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE; + ws[i].__reserved = 0; + } + + pthread_t tid; + struct waker_args wa = {.addr = &f[2], .sleep_ms = 50}; + pthread_create(&tid, NULL, waker_thread, &wa); + + long r = raw_futex_waitv(ws, 4, 0, NULL, CLOCK_MONOTONIC); + pthread_join(tid, NULL); + + EXPECT_EQ(r, 2, "expected index 2"); +} + +static void test_eagain_stale(void) +{ + TEST("EAGAIN on stale value"); + + uint32_t f[2] __attribute__((aligned(4))) = {0, 7}; + struct waitv_elem ws[2] = { + {.val = 0, + .uaddr = (uint64_t) (uintptr_t) &f[0], + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE}, + /* second element has a stale expected value; futex_waitv should fail + * the whole batch with EAGAIN before blocking. 
+ */ + {.val = 99, + .uaddr = (uint64_t) (uintptr_t) &f[1], + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE}, + }; + + long r = raw_futex_waitv(ws, 2, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -11 /* -EAGAIN */, "expected -EAGAIN"); +} + +static void test_einval_paths(void) +{ + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + TEST("EINVAL nr_futexes=0"); + long r = raw_futex_waitv(&w, 0, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "nr=0"); + + TEST("EINVAL nr_futexes>128"); + r = raw_futex_waitv(&w, FUTEX_WAITV_MAX + 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "nr>128"); + + TEST("EINVAL top-level flags!=0"); + r = raw_futex_waitv(&w, 1, 1, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "flags!=0"); + + TEST("EINVAL reserved!=0"); + { + struct waitv_elem bad = w; + bad.__reserved = 1; + r = raw_futex_waitv(&bad, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "reserved!=0"); + } + + TEST("EINVAL element flags reserved bits"); + { + struct waitv_elem bad = w; + bad.flags = FUTEX2_SIZE_U32 | 0x100; /* bit 8 is reserved */ + r = raw_futex_waitv(&bad, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "elt-flags reserved"); + } + + TEST("EINVAL element size != U32"); + { + struct waitv_elem bad = w; + bad.flags = 0x01 /* SIZE_U16 */ | FUTEX2_PRIVATE; + r = raw_futex_waitv(&bad, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "size!=U32"); + } + + TEST("EINVAL bad clockid with timeout"); + { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + ts.tv_sec += 1; + r = raw_futex_waitv(&w, 1, 0, &ts, 7 /* CLOCK_BOOTTIME */); + EXPECT_RAW_ERRNO(r, -22, "bad clockid"); + } + + TEST("EINVAL bad CLOCK_REALTIME deadline nsec"); + { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_nsec = 1000000000L; + r = raw_futex_waitv(&w, 1, 0, &ts, CLOCK_REALTIME); + EXPECT_RAW_ERRNO(r, -22, "bad realtime timeout"); + } + + TEST("EINVAL bad CLOCK_MONOTONIC deadline nsec"); + { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + ts.tv_nsec = 1000000000L; + r = raw_futex_waitv(&w, 1, 0, &ts, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "bad monotonic timeout"); + } +} + +static void test_timeout_monotonic(void) +{ + TEST("ETIMEDOUT CLOCK_MONOTONIC deadline"); + + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + struct timespec deadline; + clock_gettime(CLOCK_MONOTONIC, &deadline); + deadline.tv_nsec += 100 * 1000000L; /* +100 ms */ + if (deadline.tv_nsec >= 1000000000L) { + deadline.tv_sec += 1; + deadline.tv_nsec -= 1000000000L; + } + + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + long r = raw_futex_waitv(&w, 1, 0, &deadline, CLOCK_MONOTONIC); + clock_gettime(CLOCK_MONOTONIC, &t1); + + long elapsed_ms = + (t1.tv_sec - t0.tv_sec) * 1000 + (t1.tv_nsec - t0.tv_nsec) / 1000000L; + if (r == -110 /* -ETIMEDOUT */ && elapsed_ms >= 50 && elapsed_ms <= 2000) { + PASS(); + } else { + printf("FAIL: r=%ld elapsed=%ldms (expected -110, 50<=elapsed<=2000)\n", + r, elapsed_ms); + fails++; + } +} + +static void test_timeout_realtime(void) +{ + TEST("ETIMEDOUT CLOCK_REALTIME deadline"); + + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + struct timespec 
deadline; + clock_gettime(CLOCK_REALTIME, &deadline); + deadline.tv_nsec += 100 * 1000000L; /* +100 ms */ + if (deadline.tv_nsec >= 1000000000L) { + deadline.tv_sec += 1; + deadline.tv_nsec -= 1000000000L; + } + + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + long r = raw_futex_waitv(&w, 1, 0, &deadline, CLOCK_REALTIME); + clock_gettime(CLOCK_MONOTONIC, &t1); + + long elapsed_ms = + (t1.tv_sec - t0.tv_sec) * 1000 + (t1.tv_nsec - t0.tv_nsec) / 1000000L; + if (r == -110 && elapsed_ms >= 50 && elapsed_ms <= 2000) { + PASS(); + } else { + printf("FAIL: r=%ld elapsed=%ldms (expected -110, 50<=elapsed<=2000)\n", + r, elapsed_ms); + fails++; + } +} + +/* Reviewer-driven coverage: alignment, NULL pointers, exact 128, faults. */ + +static void test_einval_unaligned(void) +{ + TEST("EINVAL unaligned uaddr"); + + /* Build a uaddr that is 1 byte off the natural 4-byte boundary. The + * underlying storage is still inside a writable mapping, so this + * exercises the alignment check rather than a fault path. + */ + static uint8_t buf[16] __attribute__((aligned(4))); + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) (buf + 1), + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + long r = raw_futex_waitv(&w, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "expected -EINVAL on unaligned uaddr"); +} + +static void test_einval_null_waiters(void) +{ + TEST("EINVAL NULL waiters pointer"); + + /* The Linux kernel rejects waiters==NULL up-front with -EINVAL (the + * !waiters branch in the !nr_futexes||nr_futexes>FUTEX_WAITV_MAX + * predicate), not at copy_from_user time. Match that. + */ + long r = raw_futex_waitv(NULL, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22 /* -EINVAL */, "expected -EINVAL on NULL waiters"); +} + +static void test_efault_timeout(void) +{ + TEST("EFAULT NULL-page timeout pointer"); + + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + /* Carve a PROT_NONE page so the timeout pointer is non-NULL but reads + * fault at copy time. + */ + void *p = mmap(NULL, 4096, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap PROT_NONE setup failed"); + return; + } + struct timespec *bad = (struct timespec *) p; + long r = raw_futex_waitv(&w, 1, 0, bad, CLOCK_MONOTONIC); + munmap(p, 4096); + EXPECT_RAW_ERRNO(r, -14, "expected -EFAULT on faulting timeout"); +} + +static void test_efault_uaddr(void) +{ + TEST("EFAULT PROT_NONE uaddr at enqueue"); + + /* Map a PROT_NONE page and aim a waiter at it. Linux returns EFAULT + * when the kernel tries to read *uaddr to compare against val. + */ + void *p = mmap(NULL, 4096, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap PROT_NONE setup failed"); + return; + } + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) p, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + long r = raw_futex_waitv(&w, 1, 0, NULL, CLOCK_MONOTONIC); + munmap(p, 4096); + EXPECT_RAW_ERRNO(r, -14, "expected -EFAULT on PROT_NONE uaddr"); +} + +static void test_max_nr_128(void) +{ + TEST("nr_futexes==128 accepted (timeout path)"); + + /* Allocate 128 4-byte slots in a single 4 KiB page so they all fit. The + * call uses an immediate deadline (already-past timestamp) so it returns + * ETIMEDOUT instead of blocking; this still pins down the inclusive 128 + * upper bound. 
+ */ + static uint32_t slots[128] __attribute__((aligned(4))); + struct waitv_elem ws[128]; + for (int i = 0; i < 128; i++) { + slots[i] = 0; + ws[i].val = 0; + ws[i].uaddr = (uint64_t) (uintptr_t) &slots[i]; + ws[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE; + ws[i].__reserved = 0; + } + struct timespec deadline; + clock_gettime(CLOCK_MONOTONIC, &deadline); + /* Already-past deadline forces immediate ETIMEDOUT */ + if (deadline.tv_sec > 0) + deadline.tv_sec -= 1; + long r = raw_futex_waitv(ws, 128, 0, &deadline, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -110, "expected -ETIMEDOUT (128 elements accepted)"); +} + +int main(void) +{ + printf("test-futex-waitv:\n"); + test_single_wake(); + test_multi_wake_index(); + test_eagain_stale(); + test_einval_paths(); + test_einval_unaligned(); + test_einval_null_waiters(); + test_efault_timeout(); + test_efault_uaddr(); + test_max_nr_128(); + test_timeout_monotonic(); + test_timeout_realtime(); + SUMMARY("test-futex-waitv"); + return fails == 0 ? 0 : 1; +}