From 0b586ec25b2450ec9ab79c846cf7ea53d77fd6d7 Mon Sep 17 00:00:00 2001 From: Alessandro Vetere Date: Mon, 29 Jun 2026 18:10:27 +0200 Subject: [PATCH 1/2] MDEV-40129 Retry transient trylock failures in lock-release fast paths The trylock attempts on per-cell lock_sys_t::hash_latch (try_acquire()) and on per-table dict_table_t::lock_mutex_trylock() inside lock_release_try(), lock_release_on_prepare_try() and lock_rec_unlock_unmodified() now use a bounded spin loop (up to LOCK_RELEASE_TRY_SPIN_BUDGET CAS attempts, with MY_RELAX_CPU() between them) instead of a single CAS attempt. These paths hold trx->mutex while attempting the trylock, which is the reverse of the standard order used by lock_rec_convert_impl_to_expl(). Blocking acquisition is therefore unsafe, hence the trylock pattern. However, a single failed CAS marks the entire pass of lock_release_try() as unsuccessful, and after 5 such failed passes lock_release() falls back to exclusive lock_sys.wr_lock() for the whole transaction. That global wr_lock then blocks every concurrent lock_sys.rd_lock() acquirer in lock_rec_lock() and lock_table(), producing a server-wide convoy under heavy concurrency. The bounded spin (no syscall, no blocking) gives a transient latch holder time to release without weakening the deadlock-avoidance guarantee that motivated the trylock pattern. The extra trx->mutex hold time is bounded by LOCK_RELEASE_TRY_SPIN_BUDGET times the pause cost. This is a first, still to be fine-tuned implementation. Only the lock_release_try() path has been positively tested; the lock_release_on_prepare_try() path is not yet covered. --- storage/innobase/include/dict0mem.h | 4 ++ storage/innobase/include/lock0lock.h | 4 ++ storage/innobase/lock/lock0lock.cc | 59 +++++++++++++++++++++++++--- 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index 6569de9564647..3d79b41d18e84 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -2157,6 +2157,10 @@ struct dict_table_t { void lock_mutex_lock() { lock_latch.wr_lock(ut_d(SRW_LOCK_CALL)); } /** Try to acquire exclusive lock_latch */ bool lock_mutex_trylock() { return lock_latch.wr_lock_try(); } + /** Try to acquire exclusive lock_latch with a bounded spin (no syscall, + no blocking), for the lock-release fast paths that hold trx->mutex. + @return whether lock_latch was acquired */ + bool lock_mutex_trylock_spin() noexcept; /** Release exclusive lock_latch */ void lock_mutex_unlock() { lock_latch.wr_unlock(); } /** Acquire shared lock_latch */ diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 9c192f2a58644..06cdcf8d21470 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -663,6 +663,10 @@ class lock_sys_t /** @return whether this latch is possibly held by any thread */ bool is_locked() const noexcept { return lock.is_locked(); } #endif + /** Try to acquire the latch with a bounded spin (no syscall, no + blocking), for the lock-release fast paths that hold trx->mutex. + @return whether the latch was acquired */ + bool try_acquire_spin() noexcept; }; public: diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 62f6fd58a1b6b..657367df23a99 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -48,6 +48,7 @@ Created 5/7/1996 Heikki Tuuri #include "que0que.h" #include "scope.h" #include "buf0buf.h" +#include "my_cpu.h" #include #include @@ -4529,6 +4530,52 @@ lock_rec_unlock( g.cell(), first_lock, heap_no); } +/** Spin budget for lock_release_try(), lock_release_on_prepare_try() and +lock_rec_unlock_unmodified() trylock attempts on per-cell and per-table +latches. + +Those paths hold trx->mutex while attempting the trylock (the reverse of +the standard order used by lock_rec_convert_impl_to_expl()), so blocking +acquisition would risk deadlock. A single CAS attempt fails too readily +under contention: any one failure across the trx's record/table locks +marks the whole try-pass as unsuccessful, and after 5 such passes +lock_release() escalates to lock_sys.wr_lock() for the entire trx, which +then blocks every concurrent lock_sys.rd_lock() acquirer in lock_rec_lock() +and lock_table(). + +A bounded spin (CAS, PAUSE between attempts, no syscall, no blocking) lets a +transient holder release without changing the deadlock-avoidance guarantee. The +upper bound on extra trx->mutex hold time is LOCK_RELEASE_TRY_SPIN_BUDGET +* pause-cost (tens to ~100ns per iteration, microarchitecture-dependent). */ +static constexpr unsigned LOCK_RELEASE_TRY_SPIN_BUDGET= 16; +static_assert(LOCK_RELEASE_TRY_SPIN_BUDGET, "the !--spin loops would underflow"); + +ATTRIBUTE_NOINLINE +bool lock_sys_t::hash_latch::try_acquire_spin() noexcept +{ + for (unsigned spin= LOCK_RELEASE_TRY_SPIN_BUDGET;;) + { + if (try_acquire()) + return true; + if (!--spin) + return false; + MY_RELAX_CPU(); + } +} + +ATTRIBUTE_NOINLINE +bool dict_table_t::lock_mutex_trylock_spin() noexcept +{ + for (unsigned spin= LOCK_RELEASE_TRY_SPIN_BUDGET;;) + { + if (lock_mutex_trylock()) + return true; + if (!--spin) + return false; + MY_RELAX_CPU(); + } +} + /** Release the explicit locks of a committing transaction, and release possible other transactions waiting because of these locks. @return whether the operation succeeded */ @@ -4573,7 +4620,7 @@ static bool lock_release_try(trx_t *trx) auto &lock_hash= lock_sys.hash_get(lock->type_mode); auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold()); auto latch= lock_sys_t::hash_table::latch(cell); - if (!latch->try_acquire()) + if (!latch->try_acquire_spin()) all_released= false; else { @@ -4588,7 +4635,7 @@ static bool lock_release_try(trx_t *trx) ut_ad(table->id >= DICT_HDR_FIRST_ID || (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) || trx->dict_operation || trx->was_dict_operation); - if (!table->lock_mutex_trylock()) + if (!table->lock_mutex_trylock_spin()) all_released= false; else { @@ -4831,7 +4878,7 @@ bool lock_rec_unlock_unmodified(buf_block_t *block, hash_cell_t *cell, computed cell address invalid */ cell= lock_sys.rec_hash.cell_get( lock->un_member.rec_lock.page_id.fold()); - if (!lock_sys_t::hash_table::latch(cell)->try_acquire()) + if (!lock_sys_t::hash_table::latch(cell)->try_acquire_spin()) return false; } if (lock->trx != impl_trx) @@ -4889,7 +4936,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified) const auto fold = lock->un_member.rec_lock.page_id.fold(); auto cell= lock_sys.rec_hash.cell_get(fold); auto latch= lock_sys_t::hash_table::latch(cell); - if (latch->try_acquire()) + if (latch->try_acquire_spin()) { if (!rec_granted_exclusive_not_gap) { @@ -4926,7 +4973,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified) computed cell address invalid */ cell= lock_sys.rec_hash.cell_get(fold); latch= lock_sys_t::hash_table::latch(cell); - if (latch->try_acquire()) + if (latch->try_acquire_spin()) { if (!lock_rec_unlock_unmodified(block, cell, lock, offsets)) @@ -4962,7 +5009,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified) switch (lock->mode()) { case LOCK_IS: case LOCK_S: - if (table->lock_mutex_trylock()) + if (table->lock_mutex_trylock_spin()) { lock_table_dequeue(lock, false); table->lock_mutex_unlock(); From 7c0e3e987db1543c084254db995acd7ecccf4ea5 Mon Sep 17 00:00:00 2001 From: Alessandro Vetere Date: Tue, 23 Jun 2026 13:09:45 +0200 Subject: [PATCH 2/2] MDEV-40209 Escalate lock-release via a saturating stall counter lock_release() and lock_release_on_prepare() release a committing or preparing transaction's explicit locks under the shared lock_sys.rd_lock(), taking each per-cell hash latch and per-table lock mutex with a trylock because trx->mutex is held in the reverse of the normal latch order. A single failed trylock marked the whole pass unsuccessful, and after a fixed cap of 5 such passes the code escalated to the exclusive lock_sys.wr_lock() for the whole transaction. Under concurrency the trylocks fail transiently, so the cap escalated transactions that were still steadily releasing locks, not just stuck ones; the exclusive latch then blocks every concurrent lock_sys.rd_lock() acquirer in lock_rec_lock() and lock_table(), producing a convoy. The chance of hitting the cap rises with both the contention level and the number of latches a transaction must trylock per pass. Replace the fixed cap with a saturating stall counter (LOCK_RELEASE_MAX_STALLS, incremented on a no-progress pass, decremented on progress, floored at zero) that escalates a genuinely stuck transaction after 5 net stalls, as the fixed cap did, while leaving a transaction that keeps making progress to finish under the shared latch. A hard LOCK_RELEASE_MAX_PASSES ceiling bounds the loop independently, for the case where concurrent activity keeps adding locks (e.g. implicit-to-explicit conversion during XA PREPARE) so that progress never converges. The _try functions report progress through an out-parameter computed under trx->mutex, so trx->lock.trx_locks is never read unlatched. --- storage/innobase/lock/lock0lock.cc | 78 +++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 657367df23a99..49bd44004031d 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -4576,10 +4576,27 @@ bool dict_table_t::lock_mutex_trylock_spin() noexcept } } +/** Escalate the optimistic lock-release loop to the exclusive lock_sys.latch +once no-progress passes outpace progress passes by this much: a pass that +frees no lock increments the counter, a pass that frees one decrements it +(floored at 0). A fully stuck transaction still escalates after 5 passes, as +the previous fixed cap did. */ +static constexpr unsigned LOCK_RELEASE_MAX_STALLS= 5; + +/** Hard ceiling on optimistic release passes, independent of the stall +counter: bounds the loop when concurrent activity keeps adding to the lock set +(implicit-to-explicit conversion during XA PREPARE, or lock_rec_move() +relocation) so that progress never converges. Set well above +LOCK_RELEASE_MAX_STALLS so it only caps sustained churn. */ +static constexpr unsigned LOCK_RELEASE_MAX_PASSES= 100; + /** Release the explicit locks of a committing transaction, and release possible other transactions waiting because of these locks. +@param trx a committing transaction +@param made_progress set to whether this pass freed a lock, computed under + trx->mutex (so the caller needs no unlatched read) @return whether the operation succeeded */ -static bool lock_release_try(trx_t *trx) +static bool lock_release_try(trx_t *trx, bool *made_progress) { /* At this point, trx->lock.trx_locks cannot be modified by other threads, because our transaction has been committed. @@ -4593,6 +4610,7 @@ static bool lock_release_try(trx_t *trx) DBUG_ASSERT(!trx->is_referenced()); bool all_released= true; + *made_progress= false; restart: ulint count= 1000; /* We will not attempt hardware lock elision (memory transaction) @@ -4626,6 +4644,7 @@ static bool lock_release_try(trx_t *trx) { lock_rec_dequeue_from_page(lock, false); latch->release(); + *made_progress= true; } } else @@ -4641,6 +4660,7 @@ static bool lock_release_try(trx_t *trx) { lock_table_dequeue(lock, false); table->lock_mutex_unlock(); + *made_progress= true; } } @@ -4670,9 +4690,25 @@ void lock_release(trx_t *trx) #endif ulint count; - for (count= 5; count--; ) - if (lock_release_try(trx)) + /* Optimistic release under the shared latch. lock_release_try() reports + whether it freed a lock, computed under trx->mutex: we must not read + trx_locks here, as a concurrent lock_rec_move() can change it between + passes. Escalate on LOCK_RELEASE_MAX_STALLS net stalls or + LOCK_RELEASE_MAX_PASSES total passes. */ + unsigned stalls= 0; + for (count= 0; count < LOCK_RELEASE_MAX_PASSES; count++) + { + bool progressed; + if (lock_release_try(trx, &progressed)) goto released; + if (progressed) + { + if (stalls) + stalls--; + } + else if (++stalls == LOCK_RELEASE_MAX_STALLS) + break; + } /* Fall back to acquiring lock_sys.latch in exclusive mode */ restart: @@ -4798,13 +4834,16 @@ static void lock_rec_unlock(hash_cell_t &cell, lock_t *lock, ulint heap_no) @param cell lock_sys.rec_hash cell of lock @param lock record lock @param offsets storage for rec_get_offsets() +@param made_progress (CELL only) set to true when any record lock bit is + actually cleared; the GLOBAL instantiation never reads it @tparam latch_type how the caller of the function latched lock_sys, GLOBAL or CELL @return true if the cell was latched successfully or if latch_type is GLOBAL, false otherwise */ template bool lock_rec_unlock_unmodified(buf_block_t *block, hash_cell_t *cell, - lock_t *lock, rec_offs *offsets) + lock_t *lock, rec_offs *offsets, + bool *made_progress= nullptr) { DEBUG_SYNC_C("lock_rec_unlock_unmodified_start"); lock_sys.assert_locked(*lock); @@ -4833,6 +4872,8 @@ bool lock_rec_unlock_unmodified(buf_block_t *block, hash_cell_t *cell, continue; unlock_rec: lock_rec_unlock(*cell, lock, i); + if constexpr (latch_type == CELL) + *made_progress= true; } else { @@ -4894,7 +4935,8 @@ and wake up possible other transactions waiting because of these locks. @param trx transaction in XA PREPARE state @param unlock_unmodified must unmodified by the trx records be unlocked or not @return whether all locks were released */ -static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified) +static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified, + bool *made_progress) { /* At this point, trx->lock.trx_locks can still be modified by other threads to convert implicit exclusive locks into explicit ones. @@ -4905,6 +4947,10 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified) DBUG_ASSERT(trx->state == TRX_STATE_PREPARED); bool all_released= true; + /* Set at every site that frees a lock, including the unlock_unmodified path: + lock_rec_unlock_unmodified() reports through made_progress whether it actually + cleared any bit, since it returns true even when it frees nothing. */ + *made_progress= false; mtr_t mtr{trx}; rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; rec_offs *offsets= offsets_; @@ -4942,11 +4988,13 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified) { lock_rec_dequeue_from_page(lock, false); latch->release(); + *made_progress= true; } else if (supremum_bit) { lock_rec_unlock(*cell, lock, PAGE_HEAP_NO_SUPREMUM); latch->release(); + *made_progress= true; } else if (unlock_unmodified) { @@ -4976,7 +5024,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified) if (latch->try_acquire_spin()) { if (!lock_rec_unlock_unmodified(block, cell, lock, - offsets)) + offsets, made_progress)) all_released= false; else latch->release(); @@ -5013,6 +5061,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified) { lock_table_dequeue(lock, false); table->lock_mutex_unlock(); + *made_progress= true; } else all_released= false; @@ -5078,9 +5127,22 @@ void lock_release_on_prepare(trx_t *trx) #ifdef ENABLED_DEBUG_SYNC DBUG_EXECUTE_IF("skip_lock_release_on_prepare_try", goto skip_try;); #endif - for (ulint count= 5; count--; ) - if (lock_release_on_prepare_try(trx, unlock_unmodified)) + /* Same optimistic-release loop as lock_release() (see there). A prepared + transaction is still active, so trx_locks may grow between passes, making + LOCK_RELEASE_MAX_PASSES the effective bound. */ + for (unsigned count= 0, stalls= 0; count < LOCK_RELEASE_MAX_PASSES; count++) + { + bool progressed; + if (lock_release_on_prepare_try(trx, unlock_unmodified, &progressed)) return; + if (progressed) + { + if (stalls) + stalls--; + } + else if (++stalls == LOCK_RELEASE_MAX_STALLS) + break; + } #ifdef ENABLED_DEBUG_SYNC skip_try: #endif