From 1423b4a70b4dddd3387364d74160fdc9855e4585 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Tue, 5 May 2026 15:21:03 +0800
Subject: [PATCH] Honor futex_waitv clockid and wake without polling

futex_waitv (SYS 449) is SYSCALL_DEFINE5 in the kernel: (waiters, nr_futexes,
flags, timeout, clockid). The dispatch wrapper sc_futex_waitv was forwarding
only x0..x3, dropping x4. The implementation hardcoded a
CLOCK_MONOTONIC->CLOCK_REALTIME deadline conversion, so a guest asking for
CLOCK_REALTIME got monotonic semantics regardless of what it passed.

Wire x4 through and give sys_futex_waitv a clockid parameter. Validate
against CLOCK_REALTIME or CLOCK_MONOTONIC when timeout is non-NULL (Linux
returns EINVAL otherwise) and branch the absolute-deadline conversion on
clockid.

The pre-existing waitv loop slept on a private cond that no wake site ever
signalled; forward progress relied on a 50ms poll. Add optional group_lock /
group_cond pointers to futex_waiter_t plus a futex_waiter_notify_group
helper. Each wake site (futex_wake, futex_requeue, futex_wake_op both
passes, futex_unlock_pi) calls it after marking woken=1 under the bucket
lock. waitv blocks on shared.cond directly; the loop's bounded sleep is now
500ms, kept only so exit_group and timeout edges are still observed when no
signal arrives.

Lock ordering is bucket -> group_lock; futex_waiter_notify_group is only
called by wake sites that already hold the bucket lock, and the waitv thread
never holds shared.lock while taking a bucket lock. Stack lifetime of
shared.lock and shared.cond is protected by the bucket-lock pairing in
waitv_unlink, which synchronizes with every wake's notify_group before the
destroys run.

Locked in by tests/test-futex-waitv.c (19 cases): single-element wake
returning index 0, multi-element wake returning the woken index, EAGAIN on
stale val, eleven EINVAL paths covering nr=0, nr>128, top-level flags,
reserved!=0, element flags reserved bits, size!=U32, malformed
CLOCK_REALTIME / CLOCK_MONOTONIC nsec, unaligned uaddr, NULL waiters_gva,
and bad clockid with timeout; two EFAULT paths via PROT_NONE pages; the
inclusive nr_futexes==128 boundary; and ETIMEDOUT under both CLOCK_MONOTONIC
and CLOCK_REALTIME deadlines. The CLOCK_REALTIME case exists specifically to
catch a regression of the dropped-clockid bug.

Verified against Linux ground truth via tests/qemu-runner.sh; all 19 cases
match.
---
 Makefile                 |   6 +
 src/runtime/futex.c      | 263 +++++++++++++++++-------
 src/runtime/futex.h      |   5 +-
 src/syscall/syscall.c    |   3 +-
 tests/manifest.txt       |   3 +
 tests/test-futex-waitv.c | 418 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 623 insertions(+), 75 deletions(-)
 create mode 100644 tests/test-futex-waitv.c

diff --git a/Makefile b/Makefile
index 2895a93..8690e60 100644
--- a/Makefile
+++ b/Makefile
@@ -153,6 +153,12 @@ $(BUILD_DIR)/test-signalfd-hardening: tests/test-signalfd-hardening.c | $(BUILD_
 	@echo "  CROSS  $< (with -lpthread)"
 	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
 
+# test-futex-waitv needs -lpthread for the host wake-thread used to unblock
+# the main thread's futex_waitv.
+$(BUILD_DIR)/test-futex-waitv: tests/test-futex-waitv.c | $(BUILD_DIR)
+	@echo "  CROSS  $< (with -lpthread)"
+	$(Q)$(CROSS_COMPILE)gcc -D_GNU_SOURCE -static -O2 -o $@ $< -lpthread
+
 endif
 
 include mk/tests.mk
diff --git a/src/runtime/futex.c b/src/runtime/futex.c
index ca00358..0d84979 100644
--- a/src/runtime/futex.c
+++ b/src/runtime/futex.c
@@ -79,6 +79,10 @@ static _Atomic int futex_interrupt_requested = 0;
 
 /* Per-waiter node. 
Allocated on the host stack of the waiting thread (no malloc * needed; the waiter is stack-local to sys_futex). + * + * group_lock / group_cond are optional: when non-NULL, a wake additionally + * signals group_cond under group_lock. futex_waitv uses this so that any wake + * across the wait set unblocks the polling thread without per-bucket polling. */ typedef struct futex_waiter { uint64_t uaddr; /* Guest VA being waited on */ @@ -86,8 +90,23 @@ typedef struct futex_waiter { pthread_cond_t cond; /* Signalled by WAKE to unblock this waiter */ int woken; /* Set to 1 by WAKE before signalling */ struct futex_waiter *next; /* Next waiter in same bucket */ + pthread_mutex_t *group_lock; + pthread_cond_t *group_cond; } futex_waiter_t; +/* If the waiter belongs to a futex_waitv group, signal the group's cond so the + * polling thread wakes immediately. Caller holds the bucket lock; group_lock is + * acquired below it (lock order: bucket -> group_lock). + */ +static void futex_waiter_notify_group(futex_waiter_t *w) +{ + if (!w->group_cond) + return; + pthread_mutex_lock(w->group_lock); + pthread_cond_signal(w->group_cond); + pthread_mutex_unlock(w->group_lock); +} + /* One bucket in the hash table. Protected by its own mutex. * Lock order: 7 (leaf locks, index-ordered when two acquired). */ @@ -146,10 +165,29 @@ int futex_interrupt_pending(void) return atomic_load(&futex_interrupt_requested); } +/* Cap on guest-supplied tv_sec. The cap exists purely so the int64_t / time_t + * arithmetic in the deadline conversion (now.tv_sec + delta_sec, where + * delta_sec = lts.tv_sec - mono.tv_sec) cannot overflow even for adversarial + * inputs. INT64_MAX / 4 leaves four-way headroom for any pairwise sum or + * difference and still allows absolute CLOCK_REALTIME deadlines billions of + * years into the future, which comfortably covers the year-2038/2106 + * envelope. Linux saturates at KTIME_MAX (INT64_MAX ns ~ 292 years) on + * conversion to ktime_t; this code stays in struct timespec so it does not + * need that conversion, only the cap. + */ +#define FUTEX_TIMESPEC_SEC_MAX (INT64_MAX / 4) + +static int linux_timespec_is_valid(const linux_timespec_t *lts) +{ + return lts->tv_sec >= 0 && lts->tv_sec <= FUTEX_TIMESPEC_SEC_MAX && + lts->tv_nsec >= 0 && lts->tv_nsec < 1000000000L; +} + /* Convert a Linux guest timespec to an absolute struct timespec deadline. * For FUTEX_WAIT (relative timeout), adds the duration to the current time. * For FUTEX_WAIT_BITSET (absolute timeout), uses the value directly. - * Returns 0 on success, -1 if the guest pointer is invalid. + * Returns 0 on success, -1 if the guest pointer is invalid, -2 if the guest + * timespec is malformed. 
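+ *
+ * Illustrative caller pattern (a sketch only; the futex_wait and
+ * futex_lock_pi call sites below in this patch do exactly this mapping, so
+ * nothing here is new API):
+ *
+ *   int rc = futex_make_deadline(g, timeout_gva, is_absolute, &deadline);
+ *   if (rc == -1)
+ *       return -LINUX_EFAULT;   // guest pointer unreadable
+ *   if (rc == -2)
+ *       return -LINUX_EINVAL;   // tv_sec/tv_nsec out of range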
  */
 static int futex_make_deadline(guest_t *g,
                                uint64_t timeout_gva,
@@ -159,6 +197,8 @@ static int futex_make_deadline(guest_t *g,
     linux_timespec_t lts;
     if (guest_read_small(g, timeout_gva, &lts, sizeof(lts)) < 0)
         return -1;
+    if (!linux_timespec_is_valid(&lts))
+        return -2;
 
     if (is_absolute) {
         out->tv_sec = (time_t) lts.tv_sec;
@@ -194,8 +234,11 @@ static int64_t futex_wait(guest_t *g,
     bool has_timeout = (timeout_gva != 0);
     struct timespec deadline;
     if (has_timeout) {
-        if (futex_make_deadline(g, timeout_gva, is_absolute, &deadline) < 0)
+        int rc = futex_make_deadline(g, timeout_gva, is_absolute, &deadline);
+        if (rc == -1)
             return -LINUX_EFAULT;
+        if (rc == -2)
+            return -LINUX_EINVAL;
     }
 
     pthread_mutex_lock(&b->lock);
@@ -346,6 +389,7 @@ static int64_t futex_wake(uint64_t uaddr, uint32_t val, uint32_t bitset)
             *pp = w->next; /* Unlink before signaling */
             __atomic_store_n(&w->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w->cond);
+            futex_waiter_notify_group(w);
             woken++;
         } else {
             pp = &w->next;
@@ -428,6 +472,7 @@ static int64_t futex_requeue(guest_t *g,
             *pp = w->next;
             __atomic_store_n(&w->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w->cond);
+            futex_waiter_notify_group(w);
             woken++;
             /* Leave pp unchanged because *pp is already the next node */
         } else if ((uint32_t) requeued < requeue_count) {
@@ -562,6 +607,7 @@ static int64_t futex_wake_op(guest_t *g,
             *pp1 = w->next;
             __atomic_store_n(&w->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w->cond);
+            futex_waiter_notify_group(w);
             woken++;
         } else {
             pp1 = &w->next;
@@ -605,6 +651,7 @@ static int64_t futex_wake_op(guest_t *g,
             *pp2 = w2->next;
             __atomic_store_n(&w2->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w2->cond);
+            futex_waiter_notify_group(w2);
             woken2++;
         } else {
             pp2 = &w2->next;
@@ -663,9 +710,12 @@ static int64_t futex_lock_pi(guest_t *g, uint64_t uaddr, uint64_t timeout_gva)
     bool has_timeout = (timeout_gva != 0);
     struct timespec deadline;
     if (has_timeout) {
-        if (futex_make_deadline(g, timeout_gva, /*is_absolute=*/1, &deadline) <
-            0)
+        int rc =
+            futex_make_deadline(g, timeout_gva, /*is_absolute=*/1, &deadline);
+        if (rc == -1)
             return -LINUX_EFAULT;
+        if (rc == -2)
+            return -LINUX_EINVAL;
     }
 
     unsigned idx = futex_hash(uaddr);
@@ -901,6 +951,7 @@ static int64_t futex_unlock_pi(guest_t *g, uint64_t uaddr)
             *pp = w->next; /* Unlink before signaling */
             __atomic_store_n(&w->woken, 1, __ATOMIC_RELEASE);
             pthread_cond_signal(&w->cond);
+            futex_waiter_notify_group(w);
             break; /* Wake exactly one */
         }
         pp = &w->next;
@@ -974,13 +1025,45 @@ int futex_wake_one(guest_t *g, uint64_t uaddr)
     return (int) futex_wake(uaddr, 1, FUTEX_BITSET_MATCH_ANY);
 }
 
-/* Unlink a waiter from its bucket's list, taking the bucket lock. */
+/* Unlink a waiter from whichever bucket it currently sits in, with retry on
+ * concurrent requeue. The waiter's struct lives on the calling thread's stack;
+ * leaving a dangling reference behind is a real host-safety bug because a
+ * later wake at the new uaddr would dereference it. The regular futex_wait
+ * self-dequeue path handles the same race the same way.
+ *
+ * Termination: on each iteration we either find w in the bucket (unlink and
+ * return), or observe w->woken==1 under the bucket lock (the wake path
+ * unlinks before storing woken with RELEASE under the bucket lock; once we
+ * acquire that bucket lock we synchronize with it), or determine w was
+ * requeued elsewhere (re-hash and retry). 
Forward progress is guaranteed + * because every requeue and every wake also holds bucket locks, so once we + * take the lock for the bucket that hashes w's current uaddr, no concurrent + * mover can step around us. + */ static void waitv_unlink(futex_waiter_t *w) { - futex_bucket_t *b = &buckets[futex_hash(w->uaddr)]; - pthread_mutex_lock(&b->lock); - bucket_unlink_locked(b, w); - pthread_mutex_unlock(&b->lock); + if (__atomic_load_n(&w->woken, __ATOMIC_ACQUIRE)) + return; + for (;;) { + unsigned idx = futex_hash(w->uaddr); + futex_bucket_t *b = &buckets[idx]; + pthread_mutex_lock(&b->lock); + bool found = false; + for (futex_waiter_t **pp = &b->head; *pp; pp = &(*pp)->next) { + if (*pp == w) { + *pp = w->next; + found = true; + break; + } + } + bool was_woken = __atomic_load_n(&w->woken, __ATOMIC_ACQUIRE); + pthread_mutex_unlock(&b->lock); + if (found || was_woken) + return; + /* w must have been requeued to another bucket while we hashed. + * Re-read uaddr and try again. + */ + } } /* futex_waitv (SYS 449): batch futex wait on multiple addresses. @@ -1006,15 +1089,21 @@ typedef struct { _Static_assert(sizeof(linux_futex_waitv_t) == 24, "futex_waitv element must be 24 bytes"); -/* Shared poll state for futex_waitv. The mutex+cond pair serves only as a timed - * sleep primitive; futex_wake does not signal shared.cond directly. The poll - * loop checks waiter.woken flags periodically. +/* Shared wakeup state for futex_waitv. Each enqueued waiter holds pointers to + * this struct so any wake site (futex_wake, futex_requeue, futex_wake_op, + * futex_unlock_pi) signals shared.cond after marking the waiter woken. The + * polling loop sleeps on shared.cond with a bounded timeout so it still picks + * up exit_group requests and real timeouts even when no signal arrives. */ typedef struct { pthread_mutex_t lock; pthread_cond_t cond; } waitv_shared_t; +/* Linux clockid values accepted by futex_waitv. */ +#define LINUX_CLOCK_REALTIME 0 +#define LINUX_CLOCK_MONOTONIC 1 + static int waitv_collect_buckets(const linux_futex_waitv_t *elts, uint32_t nr_futexes, unsigned bucket_ids[FUTEX_WAITV_MAX], @@ -1047,13 +1136,64 @@ int64_t sys_futex_waitv(guest_t *g, uint64_t waiters_gva, uint32_t nr_futexes, uint32_t flags, - uint64_t timeout_gva) + uint64_t timeout_gva, + int clockid) { + /* Validation order matches Linux do_futex_waitv(): + * 1. flags + * 2. nr_futexes / !waiters + * 3. clockid (when timeout != NULL) + * 4. copy_from_user(timeout) -> EFAULT + * 5. timespec64_valid(timeout) -> EINVAL + * 6. copy_from_user(waiters) -> EFAULT + * 7. per-element validate -> EINVAL + * Reordering steps 4-7 to match Linux means a guest that passes a bad + * timeout AND bad waiters sees the same errno Linux would, instead of + * having ours fault on waiters first. + */ if (flags != 0) return -LINUX_EINVAL; - if (nr_futexes == 0 || nr_futexes > FUTEX_WAITV_MAX) + if (nr_futexes == 0 || nr_futexes > FUTEX_WAITV_MAX || waiters_gva == 0) return -LINUX_EINVAL; + bool has_timeout = (timeout_gva != 0); + if (has_timeout && clockid != LINUX_CLOCK_REALTIME && + clockid != LINUX_CLOCK_MONOTONIC) + return -LINUX_EINVAL; + + /* Copy and validate the timeout before reading the waiters array. 
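+     *
+     * Concrete consequence of keeping that order (guest-side sketch, not an
+     * additional test; bad_waiters_gva is just a stand-in for a pointer into
+     * an unmapped guest page):
+     *
+     *   struct timespec bad = { .tv_sec = 0, .tv_nsec = 1000000000 };
+     *   syscall(449, bad_waiters_gva, 1, 0, &bad, CLOCK_MONOTONIC);
+     *   // -> EINVAL from the timespec check (step 5); the EFAULT from the
+     *   //    waiters copy (step 6) is never reached, same as Linux.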
+     */
+    struct timespec deadline;
+    if (has_timeout) {
+        linux_timespec_t lts;
+        if (guest_read_small(g, timeout_gva, &lts, sizeof(lts)) < 0)
+            return -LINUX_EFAULT;
+        if (!linux_timespec_is_valid(&lts))
+            return -LINUX_EINVAL;
+
+        if (clockid == LINUX_CLOCK_MONOTONIC) {
+            /* Translate the monotonic absolute deadline to a CLOCK_REALTIME
+             * absolute deadline so pthread_cond_timedwait (which uses
+             * CLOCK_REALTIME) waits the right amount. macOS has no
+             * CLOCK_MONOTONIC condattr, so this conversion is unavoidable;
+             * minor wall-clock skew is accepted. lts.tv_sec is bounded by
+             * FUTEX_TIMESPEC_SEC_MAX (linux_timespec_is_valid), so the
+             * subtraction and addition stay inside int64_t / time_t range.
+             */
+            struct timeval now;
+            gettimeofday(&now, NULL);
+            struct timespec mono;
+            clock_gettime(CLOCK_MONOTONIC, &mono);
+            int64_t delta_sec = lts.tv_sec - mono.tv_sec;
+            long delta_nsec = (long) lts.tv_nsec - mono.tv_nsec;
+            deadline.tv_sec = now.tv_sec + delta_sec;
+            deadline.tv_nsec = (long) now.tv_usec * 1000 + delta_nsec;
+        } else {
+            deadline.tv_sec = (time_t) lts.tv_sec;
+            deadline.tv_nsec = (long) lts.tv_nsec;
+        }
+        timespec_normalize(&deadline);
+    }
+
     linux_futex_waitv_t elts[FUTEX_WAITV_MAX];
     size_t sz = nr_futexes * sizeof(linux_futex_waitv_t);
     if (guest_read_small(g, waiters_gva, elts, sz) < 0)
@@ -1066,26 +1206,12 @@ int64_t sys_futex_waitv(guest_t *g,
             return -LINUX_EINVAL;
         if ((elts[i].flags & FUTEX2_SIZE_MASK) != FUTEX2_SIZE_U32)
             return -LINUX_EINVAL;
-    }
-
-    /* Convert CLOCK_MONOTONIC absolute deadline to CLOCK_REALTIME for
-     * pthread_cond_timedwait (macOS has no CLOCK_MONOTONIC condattr).
-     */
-    bool has_timeout = (timeout_gva != 0);
-    struct timespec deadline;
-    if (has_timeout) {
-        linux_timespec_t lts;
-        if (guest_read_small(g, timeout_gva, &lts, sizeof(lts)) < 0)
-            return -LINUX_EFAULT;
-        struct timeval now;
-        gettimeofday(&now, NULL);
-        struct timespec mono;
-        clock_gettime(CLOCK_MONOTONIC, &mono);
-        int64_t delta_sec = lts.tv_sec - mono.tv_sec;
-        long delta_nsec = (long) lts.tv_nsec - mono.tv_nsec;
-        deadline.tv_sec = now.tv_sec + delta_sec;
-        deadline.tv_nsec = (long) now.tv_usec * 1000 + delta_nsec;
-        timespec_normalize(&deadline);
+        /* uaddr must be naturally aligned for the declared size. For
+         * FUTEX2_SIZE_U32 that is 4-byte alignment; an unaligned futex word
+         * loses atomicity on aarch64 and matches no kernel-side behavior.
+         */
+        if (elts[i].uaddr & 0x3)
+            return -LINUX_EINVAL;
     }
 
     waitv_shared_t shared;
@@ -1129,6 +1255,8 @@ int64_t sys_futex_waitv(guest_t *g,
         w->bitset = FUTEX_BITSET_MATCH_ANY;
         w->woken = 0;
         w->next = b->head;
+        w->group_lock = &shared.lock;
+        w->group_cond = &shared.cond;
         pthread_cond_init(&w->cond, NULL);
         b->head = w;
         enqueued++;
@@ -1137,16 +1265,14 @@ int64_t sys_futex_waitv(guest_t *g,
     for (int i = nbuckets - 1; i >= 0; i--)
         pthread_mutex_unlock(&bucket_ptrs[i]->lock);
 
-    /* All enqueued. Wait for any one to be woken. Poll periodically to check
-     * all waiters (the waker signals the waiter's own cond).
+    /* All enqueued. Block on shared.cond until any wake site signals it.
+     * The bounded sleep (capped at 500ms or the user deadline, whichever is
+     * sooner) gives proc_exit_group_requested() and timeout checks a chance to
+     * run if the cond_signal never arrives.
      */
     int result_idx = -1;
     pthread_mutex_lock(&shared.lock);
    for (;;) {
-        /* Check if any waiter was woken. Use acquire load to synchronize with
-         * the release store in futex_wake (which sets woken=1 under the bucket
-         * lock, but the polling thread reads outside that lock). 
- */ for (uint32_t i = 0; i < nr_futexes; i++) { if (__atomic_load_n(&waiters[i].woken, __ATOMIC_ACQUIRE)) { result_idx = (int) i; @@ -1161,53 +1287,46 @@ int64_t sys_futex_waitv(guest_t *g, break; } - /* Poll with 50ms timeout to check for wakeups across buckets. - * futex_wake sets waiter.woken=1 and signals waiter.cond, but waitv - * doesn't block on individual waiter conds; it polls all of them. This - * gives up to 50ms latency per wakeup. For lower latency, each - * waiter.cond could also broadcast to shared.cond, but that requires - * modifying the generic futex_wake path. - * Acceptable for now since futex_waitv is mainly for Wine/Proton which - * is not yet a target workload. - */ - struct timespec poll_ts; - timespec_deadline_in_ms(&poll_ts, 50); - + struct timespec wait_ts; + timespec_deadline_in_ms(&wait_ts, 500); if (has_timeout) { - /* Use earlier of poll_ts and deadline */ - if (deadline.tv_sec < poll_ts.tv_sec || - (deadline.tv_sec == poll_ts.tv_sec && - deadline.tv_nsec < poll_ts.tv_nsec)) { - poll_ts = deadline; + if (deadline.tv_sec < wait_ts.tv_sec || + (deadline.tv_sec == wait_ts.tv_sec && + deadline.tv_nsec < wait_ts.tv_nsec)) { + wait_ts = deadline; } } - pthread_cond_timedwait(&shared.cond, &shared.lock, &poll_ts); - - for (uint32_t i = 0; i < nr_futexes; i++) { - if (__atomic_load_n(&waiters[i].woken, __ATOMIC_ACQUIRE)) { - result_idx = (int) i; - break; - } - } - if (result_idx >= 0) - break; + pthread_cond_timedwait(&shared.cond, &shared.lock, &wait_ts); if (has_timeout) { struct timeval now; gettimeofday(&now, NULL); long now_ns = (long) now.tv_usec * 1000; - if (now.tv_sec > deadline.tv_sec || - (now.tv_sec == deadline.tv_sec && now_ns >= deadline.tv_nsec)) { - result_idx = -LINUX_ETIMEDOUT; + bool past_deadline = + now.tv_sec > deadline.tv_sec || + (now.tv_sec == deadline.tv_sec && now_ns >= deadline.tv_nsec); + if (past_deadline) { + /* Re-check woken under shared.lock before declaring a timeout: + * a wake that arrived during the cond_timedwait may not have + * been signalled yet on this thread but the woken flag is set. + */ + for (uint32_t i = 0; i < nr_futexes; i++) { + if (__atomic_load_n(&waiters[i].woken, __ATOMIC_ACQUIRE)) { + result_idx = (int) i; + break; + } + } + if (result_idx < 0) + result_idx = -LINUX_ETIMEDOUT; break; } } } pthread_mutex_unlock(&shared.lock); - /* Unlink all waiters (woken entries are usually already removed by - * futex_wake, but a second pass is harmless and avoids stale pointers). + /* Unlink all waiters (woken entries are already removed by the wake path, + * but a second pass is harmless and avoids stale pointers). */ for (uint32_t i = 0; i < nr_futexes; i++) waitv_unlink(&waiters[i]); diff --git a/src/runtime/futex.h b/src/runtime/futex.h index 430f885..79f60eb 100644 --- a/src/runtime/futex.h +++ b/src/runtime/futex.h @@ -51,13 +51,16 @@ int64_t sys_futex(guest_t *g, int futex_wake_one(guest_t *g, uint64_t uaddr); /* futex_waitv (SYS 449): batch wait on multiple futex addresses. + * clockid selects the timeout clock (Linux CLOCK_REALTIME=0 or + * CLOCK_MONOTONIC=1); ignored when timeout_gva==0. * Returns the index of the woken futex, or negative errno. */ int64_t sys_futex_waitv(guest_t *g, uint64_t waiters_gva, uint32_t nr_futexes, uint32_t flags, - uint64_t timeout_gva); + uint64_t timeout_gva, + int clockid); /* Walk the robust futex list on thread exit and set FUTEX_OWNER_DIED * on each held lock. 
Wakes one waiter per lock so a new owner can
diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c
index 7d82918..11794eb 100644
--- a/src/syscall/syscall.c
+++ b/src/syscall/syscall.c
@@ -1445,10 +1445,9 @@ static int64_t sc_futex_waitv(guest_t *g,
                               uint64_t x5,
                               bool verbose)
 {
-    (void) x4;
     (void) x5;
     (void) verbose;
-    return sys_futex_waitv(g, x0, (uint32_t) x1, (uint32_t) x2, x3);
+    return sys_futex_waitv(g, x0, (uint32_t) x1, (uint32_t) x2, x3, (int) x4);
 }
 
 /* Generated dispatch table. */
diff --git a/tests/manifest.txt b/tests/manifest.txt
index 69be913..22b75f6 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -110,6 +110,9 @@ test-inotify
 [section] PI futex + EINTR regression tests
 test-futex-pi # diff=skip
 
+[section] futex_waitv (SYS 449) tests
+test-futex-waitv # diff=skip
+
 [section] SIGILL / null guard tests
 test-sigill
diff --git a/tests/test-futex-waitv.c b/tests/test-futex-waitv.c
new file mode 100644
index 0000000..3a007d9
--- /dev/null
+++ b/tests/test-futex-waitv.c
@@ -0,0 +1,418 @@
+/* futex_waitv (SYS 449) regression
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Tests:
+ *   1. Single waiter wakes when its futex is FUTEX_WAKEd; returns index 0.
+ *   2. Multi waiter: a wake on element[N] returns N, not 0 or another index.
+ *   3. EAGAIN when any element's val mismatches the in-memory word.
+ *   4. EINVAL on nr_futexes=0, nr_futexes>128, top-level flags!=0,
+ *      reserved!=0, element flags with reserved bits, element size !=
+ *      FUTEX2_SIZE_U32, malformed tv_nsec under either clock, and clockid
+ *      not in {CLOCK_REALTIME, CLOCK_MONOTONIC} with a non-NULL timeout.
+ *   5. ETIMEDOUT when no waker arrives before an absolute CLOCK_MONOTONIC
+ *      deadline.
+ *   6. ETIMEDOUT under CLOCK_REALTIME deadline (selected via the 5th syscall
+ *      argument). Verifies that the dispatch wrapper actually forwards x4.
+ *
+ * Syscalls exercised: futex_waitv(449), futex(98), nanosleep(101).
+ */
+
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "test-harness.h"
+#include "raw-syscall.h"
+
+#ifndef __NR_futex_waitv
+#define __NR_futex_waitv 449
+#endif
+
+#define FUTEX_WAITV_MAX 128
+#define FUTEX2_SIZE_U32 0x02
+#define FUTEX2_PRIVATE 0x80
+
+#ifndef CLOCK_REALTIME
+#define CLOCK_REALTIME 0
+#endif
+#ifndef CLOCK_MONOTONIC
+#define CLOCK_MONOTONIC 1
+#endif
+
+struct waitv_elem {
+    uint64_t val;
+    uint64_t uaddr;
+    uint32_t flags;
+    uint32_t __reserved;
+};
+
+int passes = 0, fails = 0;
+
+static long raw_futex_waitv(struct waitv_elem *waiters,
+                            unsigned int nr,
+                            unsigned int flags,
+                            struct timespec *timeout,
+                            int clockid)
+{
+    return raw_syscall5(__NR_futex_waitv, (long) waiters, (long) nr,
+                        (long) flags, (long) timeout, (long) clockid);
+}
+
+/* Helper: wake @addr after @sleep_ms milliseconds. Used to unblock the main
+ * thread's futex_waitv. The thread is joined by the test before the test
+ * returns, so the wake completes before any state on the test's stack goes
+ * out of scope. 
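+ *
+ * Typical use, mirroring test_single_wake() below (a sketch; the variable
+ * names are local to this illustration):
+ *
+ *   uint32_t f = 0;
+ *   struct waitv_elem elem = {
+ *       .val = 0,
+ *       .uaddr = (uint64_t) (uintptr_t) &f,
+ *       .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE,
+ *   };
+ *   struct waker_args wa = { .addr = &f, .sleep_ms = 50 };
+ *   pthread_t tid;
+ *   pthread_create(&tid, NULL, waker_thread, &wa);
+ *   long r = raw_futex_waitv(&elem, 1, 0, NULL, CLOCK_MONOTONIC);  // -> 0
+ *   pthread_join(tid, NULL);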
+ */ +struct waker_args { + uint32_t *addr; + long sleep_ms; +}; + +static void *waker_thread(void *arg) +{ + struct waker_args *a = (struct waker_args *) arg; + struct timespec ts = {.tv_sec = a->sleep_ms / 1000, + .tv_nsec = (a->sleep_ms % 1000) * 1000000L}; + nanosleep(&ts, NULL); + raw_futex_wake((int *) a->addr, 1); + return NULL; +} + +static void test_single_wake(void) +{ + TEST("single waiter wakes index 0"); + + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + pthread_t tid; + struct waker_args wa = {.addr = &f, .sleep_ms = 50}; + pthread_create(&tid, NULL, waker_thread, &wa); + + long r = raw_futex_waitv(&w, 1, 0, NULL, CLOCK_MONOTONIC); + pthread_join(tid, NULL); + + EXPECT_EQ(r, 0, "expected index 0"); +} + +static void test_multi_wake_index(void) +{ + TEST("multi waiter returns woken index"); + + uint32_t f[4] __attribute__((aligned(4))) = {0, 0, 0, 0}; + struct waitv_elem ws[4]; + for (int i = 0; i < 4; i++) { + ws[i].val = 0; + ws[i].uaddr = (uint64_t) (uintptr_t) &f[i]; + ws[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE; + ws[i].__reserved = 0; + } + + pthread_t tid; + struct waker_args wa = {.addr = &f[2], .sleep_ms = 50}; + pthread_create(&tid, NULL, waker_thread, &wa); + + long r = raw_futex_waitv(ws, 4, 0, NULL, CLOCK_MONOTONIC); + pthread_join(tid, NULL); + + EXPECT_EQ(r, 2, "expected index 2"); +} + +static void test_eagain_stale(void) +{ + TEST("EAGAIN on stale value"); + + uint32_t f[2] __attribute__((aligned(4))) = {0, 7}; + struct waitv_elem ws[2] = { + {.val = 0, + .uaddr = (uint64_t) (uintptr_t) &f[0], + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE}, + /* second element has a stale expected value; futex_waitv should fail + * the whole batch with EAGAIN before blocking. 
+ */ + {.val = 99, + .uaddr = (uint64_t) (uintptr_t) &f[1], + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE}, + }; + + long r = raw_futex_waitv(ws, 2, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -11 /* -EAGAIN */, "expected -EAGAIN"); +} + +static void test_einval_paths(void) +{ + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + TEST("EINVAL nr_futexes=0"); + long r = raw_futex_waitv(&w, 0, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "nr=0"); + + TEST("EINVAL nr_futexes>128"); + r = raw_futex_waitv(&w, FUTEX_WAITV_MAX + 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "nr>128"); + + TEST("EINVAL top-level flags!=0"); + r = raw_futex_waitv(&w, 1, 1, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "flags!=0"); + + TEST("EINVAL reserved!=0"); + { + struct waitv_elem bad = w; + bad.__reserved = 1; + r = raw_futex_waitv(&bad, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "reserved!=0"); + } + + TEST("EINVAL element flags reserved bits"); + { + struct waitv_elem bad = w; + bad.flags = FUTEX2_SIZE_U32 | 0x100; /* bit 8 is reserved */ + r = raw_futex_waitv(&bad, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "elt-flags reserved"); + } + + TEST("EINVAL element size != U32"); + { + struct waitv_elem bad = w; + bad.flags = 0x01 /* SIZE_U16 */ | FUTEX2_PRIVATE; + r = raw_futex_waitv(&bad, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "size!=U32"); + } + + TEST("EINVAL bad clockid with timeout"); + { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + ts.tv_sec += 1; + r = raw_futex_waitv(&w, 1, 0, &ts, 7 /* CLOCK_BOOTTIME */); + EXPECT_RAW_ERRNO(r, -22, "bad clockid"); + } + + TEST("EINVAL bad CLOCK_REALTIME deadline nsec"); + { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_nsec = 1000000000L; + r = raw_futex_waitv(&w, 1, 0, &ts, CLOCK_REALTIME); + EXPECT_RAW_ERRNO(r, -22, "bad realtime timeout"); + } + + TEST("EINVAL bad CLOCK_MONOTONIC deadline nsec"); + { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + ts.tv_nsec = 1000000000L; + r = raw_futex_waitv(&w, 1, 0, &ts, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "bad monotonic timeout"); + } +} + +static void test_timeout_monotonic(void) +{ + TEST("ETIMEDOUT CLOCK_MONOTONIC deadline"); + + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + struct timespec deadline; + clock_gettime(CLOCK_MONOTONIC, &deadline); + deadline.tv_nsec += 100 * 1000000L; /* +100 ms */ + if (deadline.tv_nsec >= 1000000000L) { + deadline.tv_sec += 1; + deadline.tv_nsec -= 1000000000L; + } + + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + long r = raw_futex_waitv(&w, 1, 0, &deadline, CLOCK_MONOTONIC); + clock_gettime(CLOCK_MONOTONIC, &t1); + + long elapsed_ms = + (t1.tv_sec - t0.tv_sec) * 1000 + (t1.tv_nsec - t0.tv_nsec) / 1000000L; + if (r == -110 /* -ETIMEDOUT */ && elapsed_ms >= 50 && elapsed_ms <= 2000) { + PASS(); + } else { + printf("FAIL: r=%ld elapsed=%ldms (expected -110, 50<=elapsed<=2000)\n", + r, elapsed_ms); + fails++; + } +} + +static void test_timeout_realtime(void) +{ + TEST("ETIMEDOUT CLOCK_REALTIME deadline"); + + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + struct timespec 
deadline; + clock_gettime(CLOCK_REALTIME, &deadline); + deadline.tv_nsec += 100 * 1000000L; /* +100 ms */ + if (deadline.tv_nsec >= 1000000000L) { + deadline.tv_sec += 1; + deadline.tv_nsec -= 1000000000L; + } + + struct timespec t0, t1; + clock_gettime(CLOCK_MONOTONIC, &t0); + long r = raw_futex_waitv(&w, 1, 0, &deadline, CLOCK_REALTIME); + clock_gettime(CLOCK_MONOTONIC, &t1); + + long elapsed_ms = + (t1.tv_sec - t0.tv_sec) * 1000 + (t1.tv_nsec - t0.tv_nsec) / 1000000L; + if (r == -110 && elapsed_ms >= 50 && elapsed_ms <= 2000) { + PASS(); + } else { + printf("FAIL: r=%ld elapsed=%ldms (expected -110, 50<=elapsed<=2000)\n", + r, elapsed_ms); + fails++; + } +} + +/* Reviewer-driven coverage: alignment, NULL pointers, exact 128, faults. */ + +static void test_einval_unaligned(void) +{ + TEST("EINVAL unaligned uaddr"); + + /* Build a uaddr that is 1 byte off the natural 4-byte boundary. The + * underlying storage is still inside a writable mapping, so this + * exercises the alignment check rather than a fault path. + */ + static uint8_t buf[16] __attribute__((aligned(4))); + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) (buf + 1), + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + long r = raw_futex_waitv(&w, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22, "expected -EINVAL on unaligned uaddr"); +} + +static void test_einval_null_waiters(void) +{ + TEST("EINVAL NULL waiters pointer"); + + /* The Linux kernel rejects waiters==NULL up-front with -EINVAL (the + * !waiters branch in the !nr_futexes||nr_futexes>FUTEX_WAITV_MAX + * predicate), not at copy_from_user time. Match that. + */ + long r = raw_futex_waitv(NULL, 1, 0, NULL, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -22 /* -EINVAL */, "expected -EINVAL on NULL waiters"); +} + +static void test_efault_timeout(void) +{ + TEST("EFAULT NULL-page timeout pointer"); + + uint32_t f __attribute__((aligned(4))) = 0; + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) &f, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + + /* Carve a PROT_NONE page so the timeout pointer is non-NULL but reads + * fault at copy time. + */ + void *p = mmap(NULL, 4096, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap PROT_NONE setup failed"); + return; + } + struct timespec *bad = (struct timespec *) p; + long r = raw_futex_waitv(&w, 1, 0, bad, CLOCK_MONOTONIC); + munmap(p, 4096); + EXPECT_RAW_ERRNO(r, -14, "expected -EFAULT on faulting timeout"); +} + +static void test_efault_uaddr(void) +{ + TEST("EFAULT PROT_NONE uaddr at enqueue"); + + /* Map a PROT_NONE page and aim a waiter at it. Linux returns EFAULT + * when the kernel tries to read *uaddr to compare against val. + */ + void *p = mmap(NULL, 4096, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap PROT_NONE setup failed"); + return; + } + struct waitv_elem w = { + .val = 0, + .uaddr = (uint64_t) (uintptr_t) p, + .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE, + }; + long r = raw_futex_waitv(&w, 1, 0, NULL, CLOCK_MONOTONIC); + munmap(p, 4096); + EXPECT_RAW_ERRNO(r, -14, "expected -EFAULT on PROT_NONE uaddr"); +} + +static void test_max_nr_128(void) +{ + TEST("nr_futexes==128 accepted (timeout path)"); + + /* Allocate 128 4-byte slots in a single 4 KiB page so they all fit. The + * call uses an immediate deadline (already-past timestamp) so it returns + * ETIMEDOUT instead of blocking; this still pins down the inclusive 128 + * upper bound. 
+ */ + static uint32_t slots[128] __attribute__((aligned(4))); + struct waitv_elem ws[128]; + for (int i = 0; i < 128; i++) { + slots[i] = 0; + ws[i].val = 0; + ws[i].uaddr = (uint64_t) (uintptr_t) &slots[i]; + ws[i].flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE; + ws[i].__reserved = 0; + } + struct timespec deadline; + clock_gettime(CLOCK_MONOTONIC, &deadline); + /* Already-past deadline forces immediate ETIMEDOUT */ + if (deadline.tv_sec > 0) + deadline.tv_sec -= 1; + long r = raw_futex_waitv(ws, 128, 0, &deadline, CLOCK_MONOTONIC); + EXPECT_RAW_ERRNO(r, -110, "expected -ETIMEDOUT (128 elements accepted)"); +} + +int main(void) +{ + printf("test-futex-waitv:\n"); + test_single_wake(); + test_multi_wake_index(); + test_eagain_stale(); + test_einval_paths(); + test_einval_unaligned(); + test_einval_null_waiters(); + test_efault_timeout(); + test_efault_uaddr(); + test_max_nr_128(); + test_timeout_monotonic(); + test_timeout_realtime(); + SUMMARY("test-futex-waitv"); + return fails == 0 ? 0 : 1; +}