diff --git a/src/node_mutex.h b/src/node_mutex.h
index 69e64ba27c3106..b8831a9805cf66 100644
--- a/src/node_mutex.h
+++ b/src/node_mutex.h
@@ -139,6 +139,9 @@ class ConditionVariableBase {
   inline void Broadcast(const ScopedLock&);
   inline void Signal(const ScopedLock&);
   inline void Wait(const ScopedLock& scoped_lock);
+  // Returns true if signaled before timeout, false if timed out.
+  // timeout_ns is in nanoseconds.
+  inline bool TimedWait(const ScopedLock& scoped_lock, uint64_t timeout_ns);
 
   ConditionVariableBase(const ConditionVariableBase&) = delete;
   ConditionVariableBase& operator=(const ConditionVariableBase&) = delete;
@@ -175,6 +178,12 @@ struct LibuvMutexTraits {
     uv_cond_wait(cond, mutex);
   }
 
+  static inline int cond_timedwait(CondT* cond,
+                                   MutexT* mutex,
+                                   uint64_t timeout) {
+    return uv_cond_timedwait(cond, mutex, timeout);
+  }
+
   static inline void mutex_destroy(MutexT* mutex) {
     uv_mutex_destroy(mutex);
   }
@@ -249,6 +258,13 @@ void ConditionVariableBase<Traits>::Wait(const ScopedLock& scoped_lock) {
   Traits::cond_wait(&cond_, &scoped_lock.mutex_.mutex_);
 }
 
+template <typename Traits>
+bool ConditionVariableBase<Traits>::TimedWait(const ScopedLock& scoped_lock,
+                                              uint64_t timeout_ns) {
+  return Traits::cond_timedwait(
+             &cond_, &scoped_lock.mutex_.mutex_, timeout_ns) == 0;
+}
+
 template <typename Traits>
 MutexBase<Traits>::MutexBase() {
   CHECK_EQ(0, Traits::mutex_init(&mutex_));
diff --git a/src/node_platform.cc b/src/node_platform.cc
index 197102068b74f4..7ce8db9971bac1 100644
--- a/src/node_platform.cc
+++ b/src/node_platform.cc
@@ -300,6 +300,10 @@ void WorkerThreadsTaskRunner::BlockingDrain() {
   pending_worker_tasks_.Lock().BlockingDrain();
 }
 
+bool WorkerThreadsTaskRunner::TimedBlockingDrain(uint64_t timeout_ns) {
+  return pending_worker_tasks_.Lock().TimedBlockingDrain(timeout_ns);
+}
+
 void WorkerThreadsTaskRunner::Shutdown() {
   pending_worker_tasks_.Lock().Stop();
   delayed_task_scheduler_->Stop();
@@ -580,27 +584,33 @@ void NodePlatform::DrainTasks(Isolate* isolate) {
   std::shared_ptr<PerIsolatePlatformData> per_isolate =
       ForNodeIsolate(isolate);
   if (!per_isolate) return;
 
+  // Use a timed drain instead of blocking indefinitely on outstanding worker
+  // tasks. This fixes the deadlock described in:
+  // https://github.com/nodejs/node/issues/54918
+  //
+  // A kUserBlocking worker task (e.g. Maglev JIT compilation) may post a
+  // foreground task and wait for it to complete. If that foreground task is
+  // posted after we flush the foreground queue but before we enter the wait
+  // inside BlockingDrain(), the main thread sleeps while the worker is also
+  // waiting for its foreground task — a mutual deadlock.
+  //
+  // With a timed drain, the main thread wakes up every kDrainIntervalNs and
+  // flushes any foreground tasks that were posted in that window, allowing
+  // the blocked worker task to proceed. In the common case, tasks complete
+  // before the timeout and NotifyOfOutstandingCompletion() wakes the wait
+  // immediately, so there is no performance regression.
+  //
+  // We still wait only on kUserBlocking tasks (tracked via is_outstanding())
+  // because they are documented to require completion before execution
+  // continues (e.g. wasm async compilation).
+  static constexpr uint64_t kDrainIntervalNs = 1'000'000;  // 1ms in nanoseconds
+
   do {
-    // FIXME(54918): we should not be blocking on the worker tasks on the
-    // main thread in one go. Doing so leads to two problems:
-    // 1. If any of the worker tasks post another foreground task and wait
-    //    for it to complete, and that foreground task is posted right after
-    //    we flush the foreground task queue and before the foreground thread
-    //    goes into sleep, we'll never be able to wake up to execute that
-    //    foreground task and in turn the worker task will never complete, and
-    //    we have a deadlock.
-    // 2. Worker tasks can be posted from any thread, not necessarily associated
-    //    with the current isolate, and we can be blocking on a worker task that
-    //    is associated with a completely unrelated isolate in the event loop.
-    //    This is suboptimal.
-    //
-    // However, not blocking on the worker tasks at all can lead to loss of some
-    // critical user-blocking worker tasks e.g. wasm async compilation tasks,
-    // which should block the main thread until they are completed, as the
-    // documentation suggets. As a compromise, we currently only block on
-    // user-blocking tasks to reduce the chance of deadlocks while making sure
-    // that criticl user-blocking tasks are not lost.
-    worker_thread_task_runner_->BlockingDrain();
+    while (!worker_thread_task_runner_->TimedBlockingDrain(kDrainIntervalNs)) {
+      // Timed out: a worker task may have posted a foreground task and is
+      // waiting for it. Flush the foreground queue now so it can proceed.
+      per_isolate->FlushForegroundTasksInternal();
+    }
   } while (per_isolate->FlushForegroundTasksInternal());
 }
 
@@ -832,6 +842,16 @@ void TaskQueue<T>::Locked::BlockingDrain() {
   }
 }
 
+template <class T>
+bool TaskQueue<T>::Locked::TimedBlockingDrain(uint64_t timeout_ns) {
+  while (queue_->outstanding_tasks_ > 0) {
+    if (!queue_->outstanding_tasks_drained_.TimedWait(lock_, timeout_ns)) {
+      return false;  // timed out, outstanding tasks still pending
+    }
+  }
+  return true;
+}
+
 template <class T>
 void TaskQueue<T>::Locked::Stop() {
   queue_->stopped_ = true;
diff --git a/src/node_platform.h b/src/node_platform.h
index f47e2a46b66b84..302f278a1c1c33 100644
--- a/src/node_platform.h
+++ b/src/node_platform.h
@@ -53,6 +53,9 @@ class TaskQueue {
     std::unique_ptr<Task> BlockingPop();
     void NotifyOfOutstandingCompletion();
     void BlockingDrain();
+    // Returns true if all outstanding tasks completed before the timeout,
+    // false if timed out. timeout_ns is in nanoseconds.
+    bool TimedBlockingDrain(uint64_t timeout_ns);
     void Stop();
     PriorityQueue PopAll();
 
@@ -196,6 +199,9 @@ class WorkerThreadsTaskRunner {
                        double delay_in_seconds);
 
   void BlockingDrain();
+  // Returns true if all outstanding tasks completed before the timeout,
+  // false if timed out. timeout_ns is in nanoseconds.
+  bool TimedBlockingDrain(uint64_t timeout_ns);
   void Shutdown();
 
   int NumberOfWorkerThreads() const;