Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/node_mutex.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ class ConditionVariableBase {
inline void Broadcast(const ScopedLock&);
inline void Signal(const ScopedLock&);
inline void Wait(const ScopedLock& scoped_lock);
// Returns true if signaled before timeout, false if timed out.
// timeout_ns is in nanoseconds.
inline bool TimedWait(const ScopedLock& scoped_lock, uint64_t timeout_ns);

ConditionVariableBase(const ConditionVariableBase&) = delete;
ConditionVariableBase& operator=(const ConditionVariableBase&) = delete;
Expand Down Expand Up @@ -175,6 +178,12 @@ struct LibuvMutexTraits {
uv_cond_wait(cond, mutex);
}

static inline int cond_timedwait(CondT* cond,
MutexT* mutex,
uint64_t timeout) {
return uv_cond_timedwait(cond, mutex, timeout);
}

static inline void mutex_destroy(MutexT* mutex) {
uv_mutex_destroy(mutex);
}
Expand Down Expand Up @@ -249,6 +258,13 @@ void ConditionVariableBase<Traits>::Wait(const ScopedLock& scoped_lock) {
Traits::cond_wait(&cond_, &scoped_lock.mutex_.mutex_);
}

template <typename Traits>
bool ConditionVariableBase<Traits>::TimedWait(const ScopedLock& scoped_lock,
uint64_t timeout_ns) {
return Traits::cond_timedwait(
&cond_, &scoped_lock.mutex_.mutex_, timeout_ns) == 0;
}

template <typename Traits>
MutexBase<Traits>::MutexBase() {
CHECK_EQ(0, Traits::mutex_init(&mutex_));
Expand Down
60 changes: 40 additions & 20 deletions src/node_platform.cc
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,10 @@ void WorkerThreadsTaskRunner::BlockingDrain() {
pending_worker_tasks_.Lock().BlockingDrain();
}

bool WorkerThreadsTaskRunner::TimedBlockingDrain(uint64_t timeout_ns) {
return pending_worker_tasks_.Lock().TimedBlockingDrain(timeout_ns);
}

void WorkerThreadsTaskRunner::Shutdown() {
pending_worker_tasks_.Lock().Stop();
delayed_task_scheduler_->Stop();
Expand Down Expand Up @@ -580,27 +584,33 @@ void NodePlatform::DrainTasks(Isolate* isolate) {
std::shared_ptr<PerIsolatePlatformData> per_isolate = ForNodeIsolate(isolate);
if (!per_isolate) return;

// Use a timed drain instead of blocking indefinitely on outstanding worker
// tasks. This fixes the deadlock described in:
// https://github.com/nodejs/node/issues/54918
//
// A kUserBlocking worker task (e.g. Maglev JIT compilation) may post a
// foreground task and wait for it to complete. If that foreground task is
// posted after we flush the foreground queue but before we enter the wait
// inside BlockingDrain(), the main thread sleeps while the worker is also
// waiting for its foreground task — a mutual deadlock.
//
// With a timed drain, the main thread wakes up every kDrainIntervalNs and
// flushes any foreground tasks that were posted in that window, allowing
// the blocked worker task to proceed. In the common case, tasks complete
// before the timeout and NotifyOfOutstandingCompletion() wakes the wait
// immediately, so there is no performance regression.
//
// We still wait only on kUserBlocking tasks (tracked via is_outstanding())
// because they are documented to require completion before execution
// continues (e.g. wasm async compilation).
static constexpr uint64_t kDrainIntervalNs = 1'000'000; // 1ms in nanoseconds

do {
// FIXME(54918): we should not be blocking on the worker tasks on the
// main thread in one go. Doing so leads to two problems:
// 1. If any of the worker tasks post another foreground task and wait
// for it to complete, and that foreground task is posted right after
// we flush the foreground task queue and before the foreground thread
// goes into sleep, we'll never be able to wake up to execute that
// foreground task and in turn the worker task will never complete, and
// we have a deadlock.
// 2. Worker tasks can be posted from any thread, not necessarily associated
// with the current isolate, and we can be blocking on a worker task that
// is associated with a completely unrelated isolate in the event loop.
// This is suboptimal.
//
// However, not blocking on the worker tasks at all can lead to loss of some
// critical user-blocking worker tasks e.g. wasm async compilation tasks,
// which should block the main thread until they are completed, as the
// documentation suggets. As a compromise, we currently only block on
// user-blocking tasks to reduce the chance of deadlocks while making sure
// that criticl user-blocking tasks are not lost.
worker_thread_task_runner_->BlockingDrain();
while (!worker_thread_task_runner_->TimedBlockingDrain(kDrainIntervalNs)) {
// Timed out: a worker task may have posted a foreground task and is
// waiting for it. Flush the foreground queue now so it can proceed.
per_isolate->FlushForegroundTasksInternal();
}
} while (per_isolate->FlushForegroundTasksInternal());
}

Expand Down Expand Up @@ -832,6 +842,16 @@ void TaskQueue<T>::Locked::BlockingDrain() {
}
}

template <class T>
bool TaskQueue<T>::Locked::TimedBlockingDrain(uint64_t timeout_ns) {
while (queue_->outstanding_tasks_ > 0) {
if (!queue_->outstanding_tasks_drained_.TimedWait(lock_, timeout_ns)) {
return false; // timed out, outstanding tasks still pending
}
}
return true;
}

template <class T>
void TaskQueue<T>::Locked::Stop() {
queue_->stopped_ = true;
Expand Down
6 changes: 6 additions & 0 deletions src/node_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ class TaskQueue {
std::unique_ptr<T> BlockingPop();
void NotifyOfOutstandingCompletion();
void BlockingDrain();
// Returns true if all outstanding tasks completed before the timeout,
// false if timed out. timeout_ns is in nanoseconds.
bool TimedBlockingDrain(uint64_t timeout_ns);
void Stop();
PriorityQueue PopAll();

Expand Down Expand Up @@ -196,6 +199,9 @@ class WorkerThreadsTaskRunner {
double delay_in_seconds);

void BlockingDrain();
// Returns true if all outstanding tasks completed before the timeout,
// false if timed out. timeout_ns is in nanoseconds.
bool TimedBlockingDrain(uint64_t timeout_ns);
void Shutdown();

int NumberOfWorkerThreads() const;
Expand Down
Loading