From 0b586ec25b2450ec9ab79c846cf7ea53d77fd6d7 Mon Sep 17 00:00:00 2001
From: Alessandro Vetere <iminelink@gmail.com>
Date: Mon, 29 Jun 2026 18:10:27 +0200
Subject: [PATCH 1/2] MDEV-40129 Retry transient trylock failures in
 lock-release fast paths

The trylock attempts on per-cell lock_sys_t::hash_latch (try_acquire())
and on per-table dict_table_t::lock_mutex_trylock() inside
lock_release_try(), lock_release_on_prepare_try() and
lock_rec_unlock_unmodified() now use a bounded spin loop
(up to LOCK_RELEASE_TRY_SPIN_BUDGET CAS attempts, with MY_RELAX_CPU()
between them) instead of a single CAS attempt.

These paths hold trx->mutex while attempting the trylock, which is the
reverse of the standard order used by lock_rec_convert_impl_to_expl().
Blocking acquisition is therefore unsafe, hence the trylock pattern.
However, a single failed CAS marks the entire pass of lock_release_try()
as unsuccessful, and after 5 such failed passes lock_release() falls
back to exclusive lock_sys.wr_lock() for the whole transaction. That
global wr_lock then blocks every concurrent lock_sys.rd_lock() acquirer
in lock_rec_lock() and lock_table(), producing a server-wide convoy
under heavy concurrency.

The bounded spin (no syscall, no blocking) gives a transient latch
holder time to release without weakening the deadlock-avoidance
guarantee that motivated the trylock pattern. The extra trx->mutex hold
time is bounded by LOCK_RELEASE_TRY_SPIN_BUDGET times the pause cost.

This is a first, still to be fine-tuned implementation. Only the
lock_release_try() path has been positively tested; the
lock_release_on_prepare_try() path is not yet covered.
---
 storage/innobase/include/dict0mem.h  |  4 ++
 storage/innobase/include/lock0lock.h |  4 ++
 storage/innobase/lock/lock0lock.cc   | 59 +++++++++++++++++++++++++---
 3 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 6569de9564647..3d79b41d18e84 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -2157,6 +2157,10 @@ struct dict_table_t {
   void lock_mutex_lock() { lock_latch.wr_lock(ut_d(SRW_LOCK_CALL)); }
   /** Try to acquire exclusive lock_latch */
   bool lock_mutex_trylock() { return lock_latch.wr_lock_try(); }
+  /** Try to acquire exclusive lock_latch with a bounded spin (no syscall,
+  no blocking), for the lock-release fast paths that hold trx->mutex.
+  @return whether lock_latch was acquired */
+  bool lock_mutex_trylock_spin() noexcept;
   /** Release exclusive lock_latch */
   void lock_mutex_unlock() { lock_latch.wr_unlock(); }
   /** Acquire shared lock_latch */
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 9c192f2a58644..06cdcf8d21470 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -663,6 +663,10 @@ class lock_sys_t
     /** @return whether this latch is possibly held by any thread */
     bool is_locked() const noexcept { return lock.is_locked(); }
 #endif
+    /** Try to acquire the latch with a bounded spin (no syscall, no
+    blocking), for the lock-release fast paths that hold trx->mutex.
+    @return whether the latch was acquired */
+    bool try_acquire_spin() noexcept;
   };
 
 public:
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 62f6fd58a1b6b..657367df23a99 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -48,6 +48,7 @@ Created 5/7/1996 Heikki Tuuri
 #include "que0que.h"
 #include "scope.h"
 #include "buf0buf.h"
+#include "my_cpu.h"
 #include <debug_sync.h>
 #include <mysql/service_thd_mdl.h>
 
@@ -4529,6 +4530,52 @@ lock_rec_unlock(
 					g.cell(), first_lock, heap_no);
 }
 
+/** Spin budget for lock_release_try(), lock_release_on_prepare_try() and
+lock_rec_unlock_unmodified() trylock attempts on per-cell and per-table
+latches.
+
+Those paths hold trx->mutex while attempting the trylock (the reverse of
+the standard order used by lock_rec_convert_impl_to_expl()), so blocking
+acquisition would risk deadlock. A single CAS attempt fails too readily
+under contention: any one failure across the trx's record/table locks
+marks the whole try-pass as unsuccessful, and after 5 such passes
+lock_release() escalates to lock_sys.wr_lock() for the entire trx, which
+then blocks every concurrent lock_sys.rd_lock() acquirer in lock_rec_lock()
+and lock_table().
+
+A bounded spin (CAS, PAUSE between attempts, no syscall, no blocking) lets a
+transient holder release without changing the deadlock-avoidance guarantee. The
+upper bound on extra trx->mutex hold time is LOCK_RELEASE_TRY_SPIN_BUDGET
+* pause-cost (tens to ~100ns per iteration, microarchitecture-dependent). */
+static constexpr unsigned LOCK_RELEASE_TRY_SPIN_BUDGET= 16;
+static_assert(LOCK_RELEASE_TRY_SPIN_BUDGET, "the !--spin loops would underflow");
+
+ATTRIBUTE_NOINLINE
+bool lock_sys_t::hash_latch::try_acquire_spin() noexcept
+{
+  for (unsigned spin= LOCK_RELEASE_TRY_SPIN_BUDGET;;)
+  {
+    if (try_acquire())
+      return true;
+    if (!--spin)
+      return false;
+    MY_RELAX_CPU();
+  }
+}
+
+ATTRIBUTE_NOINLINE
+bool dict_table_t::lock_mutex_trylock_spin() noexcept
+{
+  for (unsigned spin= LOCK_RELEASE_TRY_SPIN_BUDGET;;)
+  {
+    if (lock_mutex_trylock())
+      return true;
+    if (!--spin)
+      return false;
+    MY_RELAX_CPU();
+  }
+}
+
 /** Release the explicit locks of a committing transaction,
 and release possible other transactions waiting because of these locks.
 @return whether the operation succeeded */
@@ -4573,7 +4620,7 @@ static bool lock_release_try(trx_t *trx)
       auto &lock_hash= lock_sys.hash_get(lock->type_mode);
       auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
       auto latch= lock_sys_t::hash_table::latch(cell);
-      if (!latch->try_acquire())
+      if (!latch->try_acquire_spin())
         all_released= false;
       else
       {
@@ -4588,7 +4635,7 @@ static bool lock_release_try(trx_t *trx)
       ut_ad(table->id >= DICT_HDR_FIRST_ID ||
             (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
             trx->dict_operation || trx->was_dict_operation);
-      if (!table->lock_mutex_trylock())
+      if (!table->lock_mutex_trylock_spin())
         all_released= false;
       else
       {
@@ -4831,7 +4878,7 @@ bool lock_rec_unlock_unmodified(buf_block_t *block, hash_cell_t *cell,
           computed cell address invalid */
           cell= lock_sys.rec_hash.cell_get(
               lock->un_member.rec_lock.page_id.fold());
-          if (!lock_sys_t::hash_table::latch(cell)->try_acquire())
+          if (!lock_sys_t::hash_table::latch(cell)->try_acquire_spin())
             return false;
         }
         if (lock->trx != impl_trx)
@@ -4889,7 +4936,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified)
       const auto fold = lock->un_member.rec_lock.page_id.fold();
       auto cell= lock_sys.rec_hash.cell_get(fold);
       auto latch= lock_sys_t::hash_table::latch(cell);
-      if (latch->try_acquire())
+      if (latch->try_acquire_spin())
       {
         if (!rec_granted_exclusive_not_gap)
         {
@@ -4926,7 +4973,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified)
             computed cell address invalid */
             cell= lock_sys.rec_hash.cell_get(fold);
             latch= lock_sys_t::hash_table::latch(cell);
-            if (latch->try_acquire())
+            if (latch->try_acquire_spin())
             {
               if (!lock_rec_unlock_unmodified<CELL>(block, cell, lock,
                                                      offsets))
@@ -4962,7 +5009,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified)
       switch (lock->mode()) {
       case LOCK_IS:
       case LOCK_S:
-        if (table->lock_mutex_trylock())
+        if (table->lock_mutex_trylock_spin())
         {
           lock_table_dequeue(lock, false);
           table->lock_mutex_unlock();

From 7c0e3e987db1543c084254db995acd7ecccf4ea5 Mon Sep 17 00:00:00 2001
From: Alessandro Vetere <iminelink@gmail.com>
Date: Tue, 23 Jun 2026 13:09:45 +0200
Subject: [PATCH 2/2] MDEV-40209 Escalate lock-release via a saturating stall
 counter

lock_release() and lock_release_on_prepare() release a committing or preparing
transaction's explicit locks under the shared lock_sys.rd_lock(), taking each
per-cell hash latch and per-table lock mutex with a trylock because trx->mutex
is held in the reverse of the normal latch order. A single failed trylock
marked the whole pass unsuccessful, and after a fixed cap of 5 such passes the
code escalated to the exclusive lock_sys.wr_lock() for the whole transaction.
Under concurrency the trylocks fail transiently, so the cap escalated
transactions that were still steadily releasing locks, not just stuck ones; the
exclusive latch then blocks every concurrent lock_sys.rd_lock() acquirer in
lock_rec_lock() and lock_table(), producing a convoy. The chance of hitting the
cap rises with both the contention level and the number of latches a
transaction must trylock per pass.

Replace the fixed cap with a saturating stall counter (LOCK_RELEASE_MAX_STALLS,
incremented on a no-progress pass, decremented on progress, floored at zero)
that escalates a genuinely stuck transaction after 5 net stalls, as the fixed
cap did, while leaving a transaction that keeps making progress to finish under
the shared latch. A hard LOCK_RELEASE_MAX_PASSES ceiling bounds the loop
independently, for the case where concurrent activity keeps adding locks (e.g.
implicit-to-explicit conversion during XA PREPARE) so that progress never
converges. The _try functions report progress through an out-parameter computed
under trx->mutex, so trx->lock.trx_locks is never read unlatched.
---
 storage/innobase/lock/lock0lock.cc | 78 +++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 8 deletions(-)

diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 657367df23a99..49bd44004031d 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -4576,10 +4576,27 @@ bool dict_table_t::lock_mutex_trylock_spin() noexcept
   }
 }
 
+/** Escalate the optimistic lock-release loop to the exclusive lock_sys.latch
+once no-progress passes outpace progress passes by this much: a pass that
+frees no lock increments the counter, a pass that frees one decrements it
+(floored at 0). A fully stuck transaction still escalates after 5 passes, as
+the previous fixed cap did. */
+static constexpr unsigned LOCK_RELEASE_MAX_STALLS= 5;
+
+/** Hard ceiling on optimistic release passes, independent of the stall
+counter: bounds the loop when concurrent activity keeps adding to the lock set
+(implicit-to-explicit conversion during XA PREPARE, or lock_rec_move()
+relocation) so that progress never converges. Set well above
+LOCK_RELEASE_MAX_STALLS so it only caps sustained churn. */
+static constexpr unsigned LOCK_RELEASE_MAX_PASSES= 100;
+
 /** Release the explicit locks of a committing transaction,
 and release possible other transactions waiting because of these locks.
+@param trx            a committing transaction
+@param made_progress  set to whether this pass freed a lock, computed under
+                      trx->mutex (so the caller needs no unlatched read)
 @return whether the operation succeeded */
-static bool lock_release_try(trx_t *trx)
+static bool lock_release_try(trx_t *trx, bool *made_progress)
 {
   /* At this point, trx->lock.trx_locks cannot be modified by other
   threads, because our transaction has been committed.
@@ -4593,6 +4610,7 @@ static bool lock_release_try(trx_t *trx)
   DBUG_ASSERT(!trx->is_referenced());
 
   bool all_released= true;
+  *made_progress= false;
 restart:
   ulint count= 1000;
   /* We will not attempt hardware lock elision (memory transaction)
@@ -4626,6 +4644,7 @@ static bool lock_release_try(trx_t *trx)
       {
         lock_rec_dequeue_from_page(lock, false);
         latch->release();
+        *made_progress= true;
       }
     }
     else
@@ -4641,6 +4660,7 @@ static bool lock_release_try(trx_t *trx)
       {
         lock_table_dequeue(lock, false);
         table->lock_mutex_unlock();
+        *made_progress= true;
       }
     }
 
@@ -4670,9 +4690,25 @@ void lock_release(trx_t *trx)
 #endif
   ulint count;
 
-  for (count= 5; count--; )
-    if (lock_release_try(trx))
+  /* Optimistic release under the shared latch. lock_release_try() reports
+  whether it freed a lock, computed under trx->mutex: we must not read
+  trx_locks here, as a concurrent lock_rec_move() can change it between
+  passes. Escalate on LOCK_RELEASE_MAX_STALLS net stalls or
+  LOCK_RELEASE_MAX_PASSES total passes. */
+  unsigned stalls= 0;
+  for (count= 0; count < LOCK_RELEASE_MAX_PASSES; count++)
+  {
+    bool progressed;
+    if (lock_release_try(trx, &progressed))
       goto released;
+    if (progressed)
+    {
+      if (stalls)
+        stalls--;
+    }
+    else if (++stalls == LOCK_RELEASE_MAX_STALLS)
+      break;
+  }
 
   /* Fall back to acquiring lock_sys.latch in exclusive mode */
 restart:
@@ -4798,13 +4834,16 @@ static void lock_rec_unlock(hash_cell_t &cell, lock_t *lock, ulint heap_no)
 @param  cell       lock_sys.rec_hash cell of lock
 @param  lock       record lock
 @param  offsets    storage for rec_get_offsets()
+@param  made_progress (CELL only) set to true when any record lock bit is
+                   actually cleared; the GLOBAL instantiation never reads it
 @tparam latch_type how the caller of the function latched lock_sys,
                    GLOBAL or CELL
 @return true if the cell was latched successfully or if latch_type is GLOBAL,
         false otherwise */
 template <lock_sys_latch_type latch_type>
 bool lock_rec_unlock_unmodified(buf_block_t *block, hash_cell_t *cell,
-                                lock_t *lock, rec_offs *offsets)
+                                lock_t *lock, rec_offs *offsets,
+                                bool *made_progress= nullptr)
 {
   DEBUG_SYNC_C("lock_rec_unlock_unmodified_start");
   lock_sys.assert_locked(*lock);
@@ -4833,6 +4872,8 @@ bool lock_rec_unlock_unmodified(buf_block_t *block, hash_cell_t *cell,
           continue;
       unlock_rec:
         lock_rec_unlock(*cell, lock, i);
+        if constexpr (latch_type == CELL)
+          *made_progress= true;
       }
       else
       {
@@ -4894,7 +4935,8 @@ and wake up possible other transactions waiting because of these locks.
 @param trx               transaction in XA PREPARE state
 @param unlock_unmodified must unmodified by the trx records be unlocked or not
 @return whether all locks were released */
-static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified)
+static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified,
+                                        bool *made_progress)
 {
   /* At this point, trx->lock.trx_locks can still be modified by other
   threads to convert implicit exclusive locks into explicit ones.
@@ -4905,6 +4947,10 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified)
   DBUG_ASSERT(trx->state == TRX_STATE_PREPARED);
 
   bool all_released= true;
+  /* Set at every site that frees a lock, including the unlock_unmodified path:
+  lock_rec_unlock_unmodified() reports through made_progress whether it actually
+  cleared any bit, since it returns true even when it frees nothing. */
+  *made_progress= false;
   mtr_t mtr{trx};
   rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
   rec_offs *offsets= offsets_;
@@ -4942,11 +4988,13 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified)
         {
           lock_rec_dequeue_from_page(lock, false);
           latch->release();
+          *made_progress= true;
         }
         else if (supremum_bit)
         {
           lock_rec_unlock(*cell, lock, PAGE_HEAP_NO_SUPREMUM);
           latch->release();
+          *made_progress= true;
         }
         else if (unlock_unmodified)
         {
@@ -4976,7 +5024,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified)
             if (latch->try_acquire_spin())
             {
               if (!lock_rec_unlock_unmodified<CELL>(block, cell, lock,
-                                                     offsets))
+                                                     offsets, made_progress))
                 all_released= false;
               else
                 latch->release();
@@ -5013,6 +5061,7 @@ static bool lock_release_on_prepare_try(trx_t *trx, bool unlock_unmodified)
         {
           lock_table_dequeue(lock, false);
           table->lock_mutex_unlock();
+          *made_progress= true;
         }
         else
           all_released= false;
@@ -5078,9 +5127,22 @@ void lock_release_on_prepare(trx_t *trx)
 #ifdef ENABLED_DEBUG_SYNC
   DBUG_EXECUTE_IF("skip_lock_release_on_prepare_try", goto skip_try;);
 #endif
-  for (ulint count= 5; count--; )
-    if (lock_release_on_prepare_try(trx, unlock_unmodified))
+  /* Same optimistic-release loop as lock_release() (see there). A prepared
+  transaction is still active, so trx_locks may grow between passes, making
+  LOCK_RELEASE_MAX_PASSES the effective bound. */
+  for (unsigned count= 0, stalls= 0; count < LOCK_RELEASE_MAX_PASSES; count++)
+  {
+    bool progressed;
+    if (lock_release_on_prepare_try(trx, unlock_unmodified, &progressed))
       return;
+    if (progressed)
+    {
+      if (stalls)
+        stalls--;
+    }
+    else if (++stalls == LOCK_RELEASE_MAX_STALLS)
+      break;
+  }
 #ifdef ENABLED_DEBUG_SYNC
 skip_try:
 #endif