From 8f6bd8da21eb9a7e7195bbb0a3f383c22872f797 Mon Sep 17 00:00:00 2001 From: Badrish Chandramouli Date: Wed, 1 Apr 2026 18:02:24 -0700 Subject: [PATCH 1/6] Fix ListPushPopStressTest to handle thread pool starvation --- test/Garnet.test/RespListTests.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/Garnet.test/RespListTests.cs b/test/Garnet.test/RespListTests.cs index 91eb4682645..7f1826aa629 100644 --- a/test/Garnet.test/RespListTests.cs +++ b/test/Garnet.test/RespListTests.cs @@ -1076,7 +1076,7 @@ public void CanHandleNoPrexistentKey() [Test] [Repeat(10)] - public void ListPushPopStressTest() + public async Task ListPushPopStressTest() { using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); var db = redis.GetDatabase(0); @@ -1102,22 +1102,22 @@ public void ListPushPopStressTest() await db.ListLeftPushAsync(key, j).ConfigureAwait(false); }); - tasks[i + 1] = Task.Run(() => + tasks[i + 1] = Task.Run(async () => { var key = keyArray[idx >> 1]; for (int j = 0; j < ppCount; j++) { - var value = db.ListRightPop(key); + var value = await db.ListRightPopAsync(key).ConfigureAwait(false); while (value.IsNull) { - Thread.Yield(); - value = db.ListRightPop(key); + await Task.Delay(1).ConfigureAwait(false); + value = await db.ListRightPopAsync(key).ConfigureAwait(false); } ClassicAssert.IsTrue((int)value >= 0 && (int)value < ppCount, "Pop value inconsistency"); } }); } - Task.WaitAll(tasks); + await Task.WhenAll(tasks); foreach (var key in keyArray) { From d07482bf2327558bd7ced00a94dc5acfe7b783c4 Mon Sep 17 00:00:00 2001 From: Badrish Chandramouli Date: Wed, 1 Apr 2026 19:58:09 -0700 Subject: [PATCH 2/6] Fix livelock in page allocation under extreme memory pressure with heap objects Under lowMemory settings (LogMemorySize=2k, PageSize=512) with heavy object operations, the server would livelock in TryAllocateRetryNow because: 1. IsBeyondSizeLimitAndCanEvict(addingPage:true) returned true for soft memory limits (heap objects >> target), blocking page allocations via NeedToWaitForClose. Evicting pages cannot reduce heap memory from objects that are immediately cloned back via CopyUpdate, creating an infinite RETRY_NOW spin. 2. IssueShiftAddress skipped HeadAddress advancement when IsBeyondSizeLimit was true, even at the hard MaxAllocatedPageCount limit. This prevented the TryAllocateRetryNow spin from resolving because no thread advanced HeadAddress. 3. TryAllocateRetryNow had no backoff, consuming 70%+ CPU in a tight spin loop when allocation kept returning RETRY_NOW. Fixes: - IsBeyondSizeLimitAndCanEvict: when addingPage=true, only check the hard page limit (MaxAllocatedPageCount). Soft limits are handled asynchronously by the resizer task, signaled via NeedToWaitForClose. - NeedToWaitForClose: separate hard limit (block) from soft limit (signal only). - IssueShiftAddress: always advance HeadAddress at MaxAllocatedPageCount, regardless of IsBeyondSizeLimit state. - TryAllocateRetryNow: progressive backoff (Thread.Yield -> Thread.Sleep(1)) to reduce CPU usage during unavoidable spins. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../cs/src/core/Allocator/AllocatorBase.cs | 54 ++++++++++++++----- .../src/core/Index/Common/LogSizeTracker.cs | 10 +++- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs index 9fe270aa6bc..613cfab8493 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs @@ -1071,16 +1071,23 @@ void IssueShiftAddress(long pageIndex, bool needSHA) // First check whether we need to shift HeadAddress. If we are not forcing for flush and have a logSizeTracker that's over budget then we have already issued // a shift if needed (and allowed by allocated page count); otherwise make sure we stay in the MaxAllocatedPageCount (which may be less than BufferSize). var desiredHeadAddress = HeadAddress; - if (needSHA || logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimit) + var headPage = GetPage(desiredHeadAddress); + + // Always advance HeadAddress when at or beyond MaxAllocatedPageCount (hard page limit). + // Without this, when the size tracker is over budget (IsBeyondSizeLimit is true), the condition + // below would skip HeadAddress advancement entirely, causing TryAllocateRetryNow to livelock + // because no thread advances HeadAddress and the resizer task may not run frequently enough. + if (pageIndex - headPage >= MaxAllocatedPageCount) { - var headPage = GetPage(desiredHeadAddress); - if (pageIndex - headPage >= MaxAllocatedPageCount) - { - // Snapping to start of page rather than PageHeader.Size means that HA being middle-of-page implies a partial page. - desiredHeadAddress = GetLogicalAddressOfStartOfPage(headPage + 1); - if (desiredHeadAddress > tailAddress) - desiredHeadAddress = tailAddress; - } + // Snapping to start of page rather than PageHeader.Size means that HA being middle-of-page implies a partial page. + desiredHeadAddress = GetLogicalAddressOfStartOfPage(headPage + 1); + if (desiredHeadAddress > tailAddress) + desiredHeadAddress = tailAddress; + } + else if (needSHA || logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimit) + { + // For non-hard-limit shifts when the size tracker is not over budget, advance HeadAddress + // as needed. When IsBeyondSizeLimit is true, the resizer task manages HeadAddress advancement. } // Check whether we need to shift ROA based on desiredHeadAddress. @@ -1126,10 +1133,23 @@ private bool NeedToWaitForClose(int page, out bool needSHA) } needSHA = false; - if (logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimitAndCanEvict(addingPage: true)) + if (logSizeTracker is null) return false; - logSizeTracker.Signal(); - return true; + + // Check the hard page limit: if we're at MaxAllocatedPageCount, we must wait for eviction. + if (logSizeTracker.IsBeyondSizeLimitAndCanEvict(addingPage: true)) + { + logSizeTracker.Signal(); + return true; + } + + // For soft limit violations (e.g. heap objects exceed target but page count is within range), + // signal the resizer to evict asynchronously but allow the allocation to proceed. Blocking here + // would cause a livelock when heap memory dominates the budget, because evicting pages cannot + // free heap objects that are immediately cloned back via CopyUpdate. + if (logSizeTracker.IsBeyondSizeLimit) + logSizeTracker.Signal(); + return false; } [MethodImpl(MethodImplOptions.NoInlining)] @@ -1249,12 +1269,20 @@ private long TryAllocate(int numSlots = 1) [MethodImpl(MethodImplOptions.AggressiveInlining)] public bool TryAllocateRetryNow(int numSlots, out long logicalAddress) { + var spinCount = 0; while ((logicalAddress = TryAllocate(numSlots)) < 0) { // -1: RETRY_NOW _ = TryComplete(); epoch.ProtectAndDrain(); - _ = Thread.Yield(); + + // Use progressive backoff to prevent livelock under extreme memory pressure. + // Initial spins use Thread.Yield() for low latency. After many spins, use Thread.Sleep(1) + // to release the CPU and give the resizer task and flush IO time to complete. + if (++spinCount <= 10) + _ = Thread.Yield(); + else + Thread.Sleep(1); } // 0: RETRY_LATER diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs index a56c86183f5..ee031711dfc 100644 --- a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs +++ b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs @@ -92,7 +92,8 @@ public override string ToString() public bool IsBeyondSizeLimit => TotalSize > highTargetSize; /// Return true if the total size is outside the target plus delta *and* we have pages we can (partially or completely) evict - /// If true, we are allocating a new page. Otherwise, we are called when adding or growing a new + /// If true, we are allocating a new page and only the hard page limit (MaxAllocatedPageCount) is checked. + /// Otherwise, we are called when adding or growing a new , and both the total size and page count are checked. /// This should be used only for non-Recovery, because Recovery does not set up HeadAddress and TailAddress before this is called. public bool IsBeyondSizeLimitAndCanEvict(bool addingPage = false) { @@ -105,6 +106,13 @@ public bool IsBeyondSizeLimitAndCanEvict(bool addingPage = false) if (addingPage && numPages == logAccessor.allocatorBase.MaxAllocatedPageCount) return true; + // When addingPage is true, don't block based on the soft memory limit (TotalSize > highTargetSize). + // Blocking page allocations when the over-budget is from heap objects causes a livelock: evicting pages + // cannot free heap objects that are cloned back via CopyUpdate. The resizer task handles soft-limit + // eviction asynchronously; the caller (NeedToWaitForClose) signals it separately. + if (addingPage) + return false; + // Otherwise, we need at least MinResizeTargetPageCount to be able to evict anything. return (TotalSize > highTargetSize) && numPages > MinResizeTargetPageCount; } From 511d7b4b3a79f272374862d36abe75a162c29076 Mon Sep 17 00:00:00 2001 From: Badrish Chandramouli Date: Wed, 1 Apr 2026 21:14:20 -0700 Subject: [PATCH 3/6] Fix object log read assertion crash under heavy eviction pressure CircularDiskReadBuffer.OnBeginRecord compared recordFilePosition.word against bufferFilePosition.word using the full ulong word, but this word includes the piggybacked ObjectSizeHighByte field (bits 56-63) which differs between records and is unrelated to file position. This caused spurious assertion failures in Debug mode under heavy object eviction with storage tiering enabled. Additionally, use saturating subtraction via CurrentAddress (segment + offset only) to compute the increment, avoiding ulong underflow when positions have small misalignments from sector-alignment padding at partial flush boundaries. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ObjectSerialization/CircularDiskReadBuffer.cs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/CircularDiskReadBuffer.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/CircularDiskReadBuffer.cs index 58bcb072f6f..ed5b624a808 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/CircularDiskReadBuffer.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/CircularDiskReadBuffer.cs @@ -177,10 +177,18 @@ internal bool OnBeginRecord(ObjectLogFilePositionInfo recordFilePosition) while (true) { var bufferFilePosition = buffer.GetCurrentFilePosition(); - Debug.Assert(recordFilePosition.word >= bufferFilePosition.word, $"Record file position ({recordFilePosition}) should be >= ongoing position {bufferFilePosition}"); + + // Compare using CurrentAddress (segment + offset only) rather than the full word, because the word includes + // the piggybacked ObjectSizeHighByte which differs between records and is unrelated to file position. + Debug.Assert(recordFilePosition.CurrentAddress >= bufferFilePosition.CurrentAddress, + $"Record file position ({recordFilePosition}) should be >= ongoing position {bufferFilePosition}"); Debug.Assert(recordFilePosition.SegmentId == bufferFilePosition.SegmentId, $"Record file segment ({recordFilePosition.SegmentId}) should == ongoing position {bufferFilePosition.SegmentId}"); - var increment = recordFilePosition - bufferFilePosition; - Debug.Assert(increment < objectLogDevice.SectorSize, $"Increment {increment} must be less than SectorSize ({objectLogDevice.SectorSize})"); + + // Use saturating subtraction via CurrentAddress to avoid underflow when ObjectSizeHighByte differs. + var recordAddr = recordFilePosition.CurrentAddress; + var bufferAddr = bufferFilePosition.CurrentAddress; + var increment = recordAddr >= bufferAddr ? recordAddr - bufferAddr : 0; + Debug.Assert(increment < (ulong)objectLogDevice.SectorSize, $"Increment {increment} must be less than SectorSize ({objectLogDevice.SectorSize})"); // We might cleanly align to the start of the next buffer, if there was a flush that ended on a buffer boundary. // Otherwise, we should always be within the current buffer. We should only do this "continue" once. From 5b7322aa30e546fae30566d75f95a6e84b94e6a3 Mon Sep 17 00:00:00 2001 From: Badrish Chandramouli Date: Thu, 2 Apr 2026 11:19:30 -0700 Subject: [PATCH 4/6] Fix heap size double-subtraction during concurrent eviction and CopyUpdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: CacheSizeTracker used SetLogSizeTracker() which only sets the logSizeTracker field but does NOT subscribe the tracker as onEvictionObserver. Without the eviction observer, OnNext is never called when pages are evicted. Instead, ResizeIfNeeded pre-computes heapTrimmedSize by scanning records BEFORE eviction, then subtracts it AFTER ShiftAddresses completes. But between the scan and subtraction, concurrent CopyUpdate operations on other threads can clear the source record's value object AND decrement heapSize via AddHeapSize(sizeAdjustment) — causing the same heap memory to be subtracted twice, driving heapSize negative (e.g. -80) and triggering a DebugAssertException that crashes the server session. Fix: Use SubscribeEvictions() instead of SetLogSizeTracker(). This sets both onEvictionObserver AND logSizeTracker, enabling the OnNext callback to subtract each record's CURRENT HeapMemorySize at actual eviction time. Records already cleared by concurrent CopyUpdate will have HeapMemorySize=0, avoiding the double-subtraction. Remove the stale pre-computed heapSize.Increment(-heapTrimmedSize) from ResizeIfNeeded since OnNext now handles it correctly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- libs/server/Storage/SizeTracker/CacheSizeTracker.cs | 8 +++++--- .../Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs | 9 +++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/libs/server/Storage/SizeTracker/CacheSizeTracker.cs b/libs/server/Storage/SizeTracker/CacheSizeTracker.cs index 96c12069f6a..51956063fc6 100644 --- a/libs/server/Storage/SizeTracker/CacheSizeTracker.cs +++ b/libs/server/Storage/SizeTracker/CacheSizeTracker.cs @@ -57,19 +57,21 @@ public CacheSizeTracker(TsavoriteKV store, long Debug.Assert(store != null); Debug.Assert(targetSize > 0 || readCacheTargetSize > 0); - // Subscribe to the eviction notifications. We don't hang onto the LogSubscribeDisposable because the CacheSizeTracker is never disposed once created. + // Subscribe to eviction notifications so OnNext is called when pages are evicted, allowing the tracker to subtract + // heap sizes at the correct time (when records are actually evicted rather than pre-computed). We don't hang onto + // the LogSubscribeDisposable because the CacheSizeTracker is never disposed once created. if (targetSize > 0) { mainLogTracker = new LogSizeTracker(store.Log, targetSize, targetSize / HighTargetSizeDeltaFraction, targetSize / LowTargetSizeDeltaFraction, loggerFactory?.CreateLogger("MainLogSizeTracker")); - store.Log.SetLogSizeTracker(mainLogTracker); + store.Log.SubscribeEvictions(mainLogTracker); } if (store.ReadCache != null && readCacheTargetSize > 0) { readCacheTracker = new LogSizeTracker(store.ReadCache, readCacheTargetSize, readCacheTargetSize / HighTargetSizeDeltaFraction, readCacheTargetSize / LowTargetSizeDeltaFraction, loggerFactory?.CreateLogger("ReadCacheSizeTracker")); - store.ReadCache.SetLogSizeTracker(readCacheTracker); + store.ReadCache.SubscribeEvictions(readCacheTracker); } } diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs index ee031711dfc..3c3e7f128b7 100644 --- a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs +++ b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs @@ -222,7 +222,7 @@ public void IncrementSize(long size) heapSize.Increment(size); if (size > 0 && IsBeyondSizeLimitAndCanEvict()) resizeTaskEvent.Set(); - Debug.Assert(size > 0 || heapSize.Total >= 0, $"HeapSize.Total should be >= 0 but is {heapSize.Total} in Resize"); + Debug.Assert(size > 0 || heapSize.Total >= 0, $"HeapSize.Total should be >= 0 but is {heapSize.Total} after IncrementSize({size})"); } } @@ -402,9 +402,10 @@ private void ResizeIfNeeded(CancellationToken cancellationToken) // ShiftHeadAddress caps the new HeadAddress at FlushedUntilAddress. Wait until the SHA eviction is complete to avoid going further over budget. logAccessor.ShiftAddresses(readOnlyAddress, headAddress, waitForEviction: true); - // Now subtract what we were able to trim from heapSize. Inline page total size is tracked separately in logAccessor.MemorySizeBytes. - heapSize.Increment(-heapTrimmedSize); - Debug.Assert(heapSize.Total >= 0, $"HeapSize.Total should be >= 0 but is {heapSize.Total} in Resize"); + // Heap size subtraction is handled by the OnNext eviction callback (called during ShiftAddresses → OnPagesClosed → MemoryPageScan), + // which subtracts each record's CURRENT HeapMemorySize at eviction time. This avoids double-counting with concurrent CopyUpdate + // operations that may have already cleared the source record and decremented heapSize via AddHeapSize(sizeAdjustment). + // Note: heapTrimmedSize from DetermineEvictionRange is used only for the eviction range calculation, not for the actual subtraction. // Calculate the number of trimmed pages and report the new expected AllocatedPageCount here, since our last iteration (which may have been the only one) // would have returned isComplete and thus we didn't wait for the actual eviction. From 8c5265842045a6f854809a3d56d3cc8ce61fabbc Mon Sep 17 00:00:00 2001 From: Badrish Chandramouli Date: Thu, 2 Apr 2026 18:49:18 -0700 Subject: [PATCH 5/6] Simplify fix: only the root cause (SubscribeEvictions + remove stale heapTrimmedSize) Remove unnecessary changes that were workarounds for symptoms of the heap accounting bug, not separate issues: - AllocatorBase.cs: revert NeedToWaitForClose, IssueShiftAddress, and TryAllocateRetryNow changes (the livelock was caused by heapSize growing unbounded due to double-subtraction, not by the soft limit logic itself) - CircularDiskReadBuffer.cs: revert OnBeginRecord assertion change (the assertion only fired when heapSize was wrong, causing incorrect eviction decisions that led to corrupted object log positions) - LogSizeTracker.cs: revert IsBeyondSizeLimitAndCanEvict and IncrementSize clamping changes (no longer needed with correct accounting) The actual fix is two lines in two files: 1. CacheSizeTracker.cs: SubscribeEvictions() instead of SetLogSizeTracker() 2. LogSizeTracker.cs: remove heapSize.Increment(-heapTrimmedSize) from ResizeIfNeeded (OnNext callback now handles it at eviction time) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Storage/SizeTracker/CacheSizeTracker.cs | 4 +- .../cs/src/core/Allocator/AllocatorBase.cs | 54 +++++-------------- .../CircularDiskReadBuffer.cs | 14 ++--- .../src/core/Index/Common/LogSizeTracker.cs | 19 ++----- 4 files changed, 22 insertions(+), 69 deletions(-) diff --git a/libs/server/Storage/SizeTracker/CacheSizeTracker.cs b/libs/server/Storage/SizeTracker/CacheSizeTracker.cs index 51956063fc6..2efed592666 100644 --- a/libs/server/Storage/SizeTracker/CacheSizeTracker.cs +++ b/libs/server/Storage/SizeTracker/CacheSizeTracker.cs @@ -57,9 +57,7 @@ public CacheSizeTracker(TsavoriteKV store, long Debug.Assert(store != null); Debug.Assert(targetSize > 0 || readCacheTargetSize > 0); - // Subscribe to eviction notifications so OnNext is called when pages are evicted, allowing the tracker to subtract - // heap sizes at the correct time (when records are actually evicted rather than pre-computed). We don't hang onto - // the LogSubscribeDisposable because the CacheSizeTracker is never disposed once created. + // Subscribe to the eviction notifications. We don't hang onto the LogSubscribeDisposable because the CacheSizeTracker is never disposed once created. if (targetSize > 0) { mainLogTracker = new LogSizeTracker(store.Log, targetSize, diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs index cfe8cdfced2..2a47f547a68 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs @@ -1071,23 +1071,16 @@ void IssueShiftAddress(long pageIndex, bool needSHA) // First check whether we need to shift HeadAddress. If we are not forcing for flush and have a logSizeTracker that's over budget then we have already issued // a shift if needed (and allowed by allocated page count); otherwise make sure we stay in the MaxAllocatedPageCount (which may be less than BufferSize). var desiredHeadAddress = HeadAddress; - var headPage = GetPage(desiredHeadAddress); - - // Always advance HeadAddress when at or beyond MaxAllocatedPageCount (hard page limit). - // Without this, when the size tracker is over budget (IsBeyondSizeLimit is true), the condition - // below would skip HeadAddress advancement entirely, causing TryAllocateRetryNow to livelock - // because no thread advances HeadAddress and the resizer task may not run frequently enough. - if (pageIndex - headPage >= MaxAllocatedPageCount) - { - // Snapping to start of page rather than PageHeader.Size means that HA being middle-of-page implies a partial page. - desiredHeadAddress = GetLogicalAddressOfStartOfPage(headPage + 1); - if (desiredHeadAddress > tailAddress) - desiredHeadAddress = tailAddress; - } - else if (needSHA || logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimit) + if (needSHA || logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimit) { - // For non-hard-limit shifts when the size tracker is not over budget, advance HeadAddress - // as needed. When IsBeyondSizeLimit is true, the resizer task manages HeadAddress advancement. + var headPage = GetPage(desiredHeadAddress); + if (pageIndex - headPage >= MaxAllocatedPageCount) + { + // Snapping to start of page rather than PageHeader.Size means that HA being middle-of-page implies a partial page. + desiredHeadAddress = GetLogicalAddressOfStartOfPage(headPage + 1); + if (desiredHeadAddress > tailAddress) + desiredHeadAddress = tailAddress; + } } // Check whether we need to shift ROA based on desiredHeadAddress. @@ -1127,23 +1120,10 @@ private bool NeedToWaitForClose(int page, out bool needSHA) } needSHA = false; - if (logSizeTracker is null) + if (logSizeTracker is null || !logSizeTracker.IsBeyondSizeLimitAndCanEvict(addingPage: true)) return false; - - // Check the hard page limit: if we're at MaxAllocatedPageCount, we must wait for eviction. - if (logSizeTracker.IsBeyondSizeLimitAndCanEvict(addingPage: true)) - { - logSizeTracker.Signal(); - return true; - } - - // For soft limit violations (e.g. heap objects exceed target but page count is within range), - // signal the resizer to evict asynchronously but allow the allocation to proceed. Blocking here - // would cause a livelock when heap memory dominates the budget, because evicting pages cannot - // free heap objects that are immediately cloned back via CopyUpdate. - if (logSizeTracker.IsBeyondSizeLimit) - logSizeTracker.Signal(); - return false; + logSizeTracker.Signal(); + return true; } [MethodImpl(MethodImplOptions.NoInlining)] @@ -1263,20 +1243,12 @@ private long TryAllocate(int numSlots = 1) [MethodImpl(MethodImplOptions.AggressiveInlining)] public bool TryAllocateRetryNow(int numSlots, out long logicalAddress) { - var spinCount = 0; while ((logicalAddress = TryAllocate(numSlots)) < 0) { // -1: RETRY_NOW _ = TryComplete(); epoch.ProtectAndDrain(); - - // Use progressive backoff to prevent livelock under extreme memory pressure. - // Initial spins use Thread.Yield() for low latency. After many spins, use Thread.Sleep(1) - // to release the CPU and give the resizer task and flush IO time to complete. - if (++spinCount <= 10) - _ = Thread.Yield(); - else - Thread.Sleep(1); + _ = Thread.Yield(); } // 0: RETRY_LATER diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/CircularDiskReadBuffer.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/CircularDiskReadBuffer.cs index ed5b624a808..58bcb072f6f 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/CircularDiskReadBuffer.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/ObjectSerialization/CircularDiskReadBuffer.cs @@ -177,18 +177,10 @@ internal bool OnBeginRecord(ObjectLogFilePositionInfo recordFilePosition) while (true) { var bufferFilePosition = buffer.GetCurrentFilePosition(); - - // Compare using CurrentAddress (segment + offset only) rather than the full word, because the word includes - // the piggybacked ObjectSizeHighByte which differs between records and is unrelated to file position. - Debug.Assert(recordFilePosition.CurrentAddress >= bufferFilePosition.CurrentAddress, - $"Record file position ({recordFilePosition}) should be >= ongoing position {bufferFilePosition}"); + Debug.Assert(recordFilePosition.word >= bufferFilePosition.word, $"Record file position ({recordFilePosition}) should be >= ongoing position {bufferFilePosition}"); Debug.Assert(recordFilePosition.SegmentId == bufferFilePosition.SegmentId, $"Record file segment ({recordFilePosition.SegmentId}) should == ongoing position {bufferFilePosition.SegmentId}"); - - // Use saturating subtraction via CurrentAddress to avoid underflow when ObjectSizeHighByte differs. - var recordAddr = recordFilePosition.CurrentAddress; - var bufferAddr = bufferFilePosition.CurrentAddress; - var increment = recordAddr >= bufferAddr ? recordAddr - bufferAddr : 0; - Debug.Assert(increment < (ulong)objectLogDevice.SectorSize, $"Increment {increment} must be less than SectorSize ({objectLogDevice.SectorSize})"); + var increment = recordFilePosition - bufferFilePosition; + Debug.Assert(increment < objectLogDevice.SectorSize, $"Increment {increment} must be less than SectorSize ({objectLogDevice.SectorSize})"); // We might cleanly align to the start of the next buffer, if there was a flush that ended on a buffer boundary. // Otherwise, we should always be within the current buffer. We should only do this "continue" once. diff --git a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs index 3c3e7f128b7..d1fd55c8c44 100644 --- a/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs +++ b/libs/storage/Tsavorite/cs/src/core/Index/Common/LogSizeTracker.cs @@ -92,8 +92,7 @@ public override string ToString() public bool IsBeyondSizeLimit => TotalSize > highTargetSize; /// Return true if the total size is outside the target plus delta *and* we have pages we can (partially or completely) evict - /// If true, we are allocating a new page and only the hard page limit (MaxAllocatedPageCount) is checked. - /// Otherwise, we are called when adding or growing a new , and both the total size and page count are checked. + /// If true, we are allocating a new page. Otherwise, we are called when adding or growing a new /// This should be used only for non-Recovery, because Recovery does not set up HeadAddress and TailAddress before this is called. public bool IsBeyondSizeLimitAndCanEvict(bool addingPage = false) { @@ -106,13 +105,6 @@ public bool IsBeyondSizeLimitAndCanEvict(bool addingPage = false) if (addingPage && numPages == logAccessor.allocatorBase.MaxAllocatedPageCount) return true; - // When addingPage is true, don't block based on the soft memory limit (TotalSize > highTargetSize). - // Blocking page allocations when the over-budget is from heap objects causes a livelock: evicting pages - // cannot free heap objects that are cloned back via CopyUpdate. The resizer task handles soft-limit - // eviction asynchronously; the caller (NeedToWaitForClose) signals it separately. - if (addingPage) - return false; - // Otherwise, we need at least MinResizeTargetPageCount to be able to evict anything. return (TotalSize > highTargetSize) && numPages > MinResizeTargetPageCount; } @@ -222,7 +214,7 @@ public void IncrementSize(long size) heapSize.Increment(size); if (size > 0 && IsBeyondSizeLimitAndCanEvict()) resizeTaskEvent.Set(); - Debug.Assert(size > 0 || heapSize.Total >= 0, $"HeapSize.Total should be >= 0 but is {heapSize.Total} after IncrementSize({size})"); + Debug.Assert(size > 0 || heapSize.Total >= 0, $"HeapSize.Total should be >= 0 but is {heapSize.Total} in Resize"); } } @@ -402,10 +394,9 @@ private void ResizeIfNeeded(CancellationToken cancellationToken) // ShiftHeadAddress caps the new HeadAddress at FlushedUntilAddress. Wait until the SHA eviction is complete to avoid going further over budget. logAccessor.ShiftAddresses(readOnlyAddress, headAddress, waitForEviction: true); - // Heap size subtraction is handled by the OnNext eviction callback (called during ShiftAddresses → OnPagesClosed → MemoryPageScan), - // which subtracts each record's CURRENT HeapMemorySize at eviction time. This avoids double-counting with concurrent CopyUpdate - // operations that may have already cleared the source record and decremented heapSize via AddHeapSize(sizeAdjustment). - // Note: heapTrimmedSize from DetermineEvictionRange is used only for the eviction range calculation, not for the actual subtraction. + // Heap size subtraction is handled by the OnNext eviction callback (called during ShiftAddresses), + // which subtracts each record's CURRENT HeapMemorySize at eviction time. + Debug.Assert(heapSize.Total >= 0, $"HeapSize.Total should be >= 0 but is {heapSize.Total} in Resize"); // Calculate the number of trimmed pages and report the new expected AllocatedPageCount here, since our last iteration (which may have been the only one) // would have returned isComplete and thus we didn't wait for the actual eviction. From 80b33a925c2827adcc39df09286d29bc30245d5a Mon Sep 17 00:00:00 2001 From: Badrish Chandramouli Date: Sat, 4 Apr 2026 13:31:47 -0700 Subject: [PATCH 6/6] Fix ShiftAddressesWithWait to wait on ClosedUntilAddress instead of SafeHeadAddress SafeHeadAddress is set at the start of OnPagesClosed, before the eviction observer's OnNext callback runs and before ClosedUntilAddress advances. Waiting on SafeHeadAddress allowed ShiftAddresses(waitForEviction: true) to return before eviction callbacks completed, causing stale heap size accounting since the fix now relies on the OnNext callback to decrement heapSize rather than doing it manually. ClosedUntilAddress is updated after the eviction observer fires, so waiting on it ensures eviction callbacks have fully completed. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Tsavorite/cs/src/core/Allocator/AllocatorBase.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs index 2a47f547a68..337824cb652 100644 --- a/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs +++ b/libs/storage/Tsavorite/cs/src/core/Allocator/AllocatorBase.cs @@ -990,7 +990,7 @@ internal void ShiftReadOnlyAddressWithWait(long newReadOnlyAddress, bool wait) /// /// New ReadOnlyAddress /// New HeadAddress - /// Wait for operation to complete (may involve page flushing and closing) + /// Wait for eviction to complete, i.e., until ClosedUntilAddress catches up (may involve page flushing, closing, and eviction callbacks) public void ShiftAddressesWithWait(long newReadOnlyAddress, long newHeadAddress, bool waitForEviction) { Debug.Assert(newHeadAddress <= newReadOnlyAddress, $"new HeadAddress {newHeadAddress} must not be ahead of newReadOnlyAddress {newReadOnlyAddress}"); @@ -1010,14 +1010,14 @@ public void ShiftAddressesWithWait(long newReadOnlyAddress, long newHeadAddress, epoch.Suspend(); } - while (waitForEviction && SafeHeadAddress < newHeadAddress) + while (waitForEviction && ClosedUntilAddress < newHeadAddress) _ = Thread.Yield(); return; } // Epoch already protected, so launch the shift and wait for eviction to complete _ = ShiftHeadAddress(newHeadAddress); - while (waitForEviction && SafeHeadAddress < newHeadAddress) + while (waitForEviction && ClosedUntilAddress < newHeadAddress) epoch.ProtectAndDrain(); }