From aade4845ab3e4c250b055a85ce0ba699ba4c7241 Mon Sep 17 00:00:00 2001 From: Josef Ezra Date: Thu, 6 Nov 2025 17:50:38 -0500 Subject: [PATCH 1/5] IndexingMerger: lessen work and retry by default Catching an exception, the IndexingMerger attempts to detect if it should lessen work and retry. Currently it is an include list of known "retyable" errors. If the error is not detected correctly, or not predicted, it will not retry. Being a background process, however, it is better and safer to retry by default. Resolves #3731 --- .../provider/foundationdb/IndexingBase.java | 18 ------------------ .../provider/foundationdb/IndexingMerger.java | 8 +++++++- .../foundationdb/IndexingThrottle.java | 19 ++++++++++++++++++- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingBase.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingBase.java index 5095076d57..b23228f837 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingBase.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingBase.java @@ -20,7 +20,6 @@ package com.apple.foundationdb.record.provider.foundationdb; -import com.apple.foundationdb.FDBError; import com.apple.foundationdb.FDBException; import com.apple.foundationdb.MutationType; import com.apple.foundationdb.annotation.API; @@ -59,7 +58,6 @@ import java.time.DateTimeException; import java.time.Instant; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -1213,22 +1211,6 @@ protected static T findException(@Nullable Throwable ex, Class classT) { } return null; } - - protected static boolean shouldLessenWork(@Nullable FDBException ex) { - // These error codes represent a list of errors that can occur if there is too much work to be done - // in a single transaction. - if (ex == null) { - return false; - } - final Set lessenWorkCodes = new HashSet<>(Arrays.asList( - FDBError.TIMED_OUT.code(), - FDBError.TRANSACTION_TOO_OLD.code(), - FDBError.NOT_COMMITTED.code(), - FDBError.TRANSACTION_TIMED_OUT.code(), - FDBError.COMMIT_READ_INCOMPLETE.code(), - FDBError.TRANSACTION_TOO_LARGE.code())); - return lessenWorkCodes.contains(ex.getCode()); - } } diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java index b3a4cb6dc6..f4cc82fe6a 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java @@ -150,7 +150,7 @@ private CompletableFuture handleFailure(final IndexDeferredMaintenanceC mergeControl.mergeHadFailed(); // report to adjust stats final FDBException ex = IndexingBase.findException(e, FDBException.class); final IndexDeferredMaintenanceControl.LastStep lastStep = mergeControl.getLastStep(); - if (!IndexingBase.shouldLessenWork(ex)) { + if (shouldAbort(ex)) { giveUpMerging(mergeControl, e); } switch (lastStep) { @@ -175,6 +175,12 @@ private CompletableFuture handleFailure(final IndexDeferredMaintenanceC return AsyncUtil.READY_TRUE; // and retry } + @SuppressWarnings("unused") + private boolean shouldAbort(@Nullable FDBException ex) { + // TODO: return true if the exception is clearly not retriable + return false; + } + private void handleRepartitioningFailure(final IndexDeferredMaintenanceControl mergeControl, Throwable e) { repartitionDocumentCount = mergeControl.getRepartitionDocumentCount(); if (repartitionDocumentCount == -1) { diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingThrottle.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingThrottle.java index c9e76ee1ef..eecb6ca7db 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingThrottle.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingThrottle.java @@ -20,6 +20,7 @@ package com.apple.foundationdb.record.provider.foundationdb; +import com.apple.foundationdb.FDBError; import com.apple.foundationdb.FDBException; import com.apple.foundationdb.annotation.API; import com.apple.foundationdb.async.AsyncUtil; @@ -133,7 +134,7 @@ boolean mayRetryAfterHandlingException(@Nullable FDBException fdbException, @Nullable List additionalLogMessageKeyValues, int currTries, final boolean adjustLimits) { - if (currTries >= common.config.getMaxRetries() || !IndexingBase.shouldLessenWork(fdbException)) { + if (currTries >= common.config.getMaxRetries() || !shouldLessenWork(fdbException)) { // Here: should not retry or no more retries. There is no real need to handle limits. return false; } @@ -144,6 +145,22 @@ boolean mayRetryAfterHandlingException(@Nullable FDBException fdbException, return true; } + private static boolean shouldLessenWork(@Nullable FDBException ex) { + // These error codes represent a list of errors that can occur if there is too much work to be done + // in a single transaction. + if (ex == null) { + return false; + } + final Set lessenWorkCodes = new HashSet<>(Arrays.asList( + FDBError.TIMED_OUT.code(), + FDBError.TRANSACTION_TOO_OLD.code(), + FDBError.NOT_COMMITTED.code(), + FDBError.TRANSACTION_TIMED_OUT.code(), + FDBError.COMMIT_READ_INCOMPLETE.code(), + FDBError.TRANSACTION_TOO_LARGE.code())); + return lessenWorkCodes.contains(ex.getCode()); + } + void decreaseLimit(@Nonnull FDBException fdbException, @Nullable List additionalLogMessageKeyValues) { // TODO: decrease the limit only for certain errors From 06ff45155c4f2a2c840a576de6213c5a13b4d537 Mon Sep 17 00:00:00 2001 From: Josef Ezra Date: Thu, 6 Nov 2025 18:31:59 -0500 Subject: [PATCH 2/5] Backward compatible - abort if the exception is not an FDBException --- .../record/provider/foundationdb/IndexingMerger.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java index f4cc82fe6a..e08910e2b5 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java @@ -175,8 +175,10 @@ private CompletableFuture handleFailure(final IndexDeferredMaintenanceC return AsyncUtil.READY_TRUE; // and retry } - @SuppressWarnings("unused") private boolean shouldAbort(@Nullable FDBException ex) { + if (ex == null) { + return true; + } // TODO: return true if the exception is clearly not retriable return false; } From 168a3121f5cb1bc465e530ec4fcdbd49f9eb4845 Mon Sep 17 00:00:00 2001 From: Josef Ezra Date: Mon, 10 Nov 2025 18:27:10 -0500 Subject: [PATCH 3/5] Tmp kludge for testing --- .../provider/foundationdb/IndexingMerger.java | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java index e08910e2b5..ae873586e0 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java @@ -148,9 +148,8 @@ private CompletableFuture handleFailure(final IndexDeferredMaintenanceC // merges. Not perfect, but as long as it's rare the impact should be minimal. mergeControl.mergeHadFailed(); // report to adjust stats - final FDBException ex = IndexingBase.findException(e, FDBException.class); final IndexDeferredMaintenanceControl.LastStep lastStep = mergeControl.getLastStep(); - if (shouldAbort(ex)) { + if (shouldAbort(e)) { giveUpMerging(mergeControl, e); } switch (lastStep) { @@ -175,7 +174,17 @@ private CompletableFuture handleFailure(final IndexDeferredMaintenanceC return AsyncUtil.READY_TRUE; // and retry } - private boolean shouldAbort(@Nullable FDBException ex) { + private boolean shouldAbort(@Nullable Throwable e) { + if (e == null) { + return true; + } + // Expecting AsyncToSyncTimeoutException or an instance of TimeoutException. However, cannot + // refer to AsyncToSyncTimeoutException without creating a lucene dependency + // TODO: remove this kludge + if (e.getClass().getCanonicalName().contains("Timeout") ) { + return false; + } + final FDBException ex = IndexingBase.findException(e, FDBException.class); if (ex == null) { return true; } From 7633735553bfa5e91216a74b50a30da3445c346b Mon Sep 17 00:00:00 2001 From: Josef Ezra Date: Mon, 1 Dec 2025 18:55:27 -0500 Subject: [PATCH 4/5] Add tests --- .../provider/foundationdb/IndexingMerger.java | 19 +- .../foundationdb/OnlineIndexerMergeTest.java | 194 +++++++++++++++++- 2 files changed, 207 insertions(+), 6 deletions(-) diff --git a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java index ae873586e0..f3a44af067 100644 --- a/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java +++ b/fdb-record-layer-core/src/main/java/com/apple/foundationdb/record/provider/foundationdb/IndexingMerger.java @@ -34,8 +34,10 @@ import javax.annotation.Nullable; import javax.annotation.ParametersAreNonnullByDefault; import java.util.Collections; +import java.util.IdentityHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -181,15 +183,22 @@ private boolean shouldAbort(@Nullable Throwable e) { // Expecting AsyncToSyncTimeoutException or an instance of TimeoutException. However, cannot // refer to AsyncToSyncTimeoutException without creating a lucene dependency // TODO: remove this kludge - if (e.getClass().getCanonicalName().contains("Timeout") ) { - return false; + Set seenSet = Collections.newSetFromMap(new IdentityHashMap<>()); + for (Throwable t = e; + t != null && !seenSet.contains(t); + t = t.getCause()) { + if (t.getClass().getCanonicalName().contains("Timeout")) { + return false; + } + seenSet.add(t); } + final FDBException ex = IndexingBase.findException(e, FDBException.class); - if (ex == null) { - return true; + if (ex != null) { + return false; } // TODO: return true if the exception is clearly not retriable - return false; + return true; } private void handleRepartitioningFailure(final IndexDeferredMaintenanceControl mergeControl, Throwable e) { diff --git a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/OnlineIndexerMergeTest.java b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/OnlineIndexerMergeTest.java index 4aded5dcc7..7c1a14e0f5 100644 --- a/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/OnlineIndexerMergeTest.java +++ b/fdb-record-layer-core/src/test/java/com/apple/foundationdb/record/provider/foundationdb/OnlineIndexerMergeTest.java @@ -44,12 +44,14 @@ import com.apple.foundationdb.record.provider.foundationdb.properties.RecordLayerPropertyStorage; import com.apple.foundationdb.record.query.QueryToKeyMatcher; import com.apple.foundationdb.tuple.Tuple; +import com.apple.test.BooleanSource; import com.apple.test.Tags; import com.google.auto.service.AutoService; import com.google.protobuf.Message; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -58,6 +60,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; @@ -67,6 +70,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; /** * Tests specifically of {@link OnlineIndexer#mergeIndex()}. @@ -197,6 +201,186 @@ void testMergeTimeout() { assertEquals(repeat(0, mergeLimits.size()), repartitionLimits); } + /** + * Test that a RuntimeException (non-FDB, non-timeout) causes the merger to abort immediately. + */ + @Test + void testNonRetriableExceptionAborts() { + final String indexType = "nonRetriableExceptionIndex"; + AtomicInteger attemptCount = new AtomicInteger(0); + + TestFactory.register(indexType, state -> { + adjustMergeControl(state); + attemptCount.incrementAndGet(); + + // Throw a RuntimeException that is not retriable + final CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new IllegalStateException("Non-retriable error")); + return future; + }); + + final FDBRecordStore.Builder storeBuilder = createStore(indexType); + try (OnlineIndexer indexer = OnlineIndexer.newBuilder() + .setRecordStoreBuilder(storeBuilder) + .setTargetIndexesByName(List.of(INDEX_NAME)) + .setMaxAttempts(10) + .build()) { + Assertions.assertThrows(IllegalStateException.class, indexer::mergeIndex); + } + + // Should only attempt once, no retries + assertEquals(1, attemptCount.get()); + } + + /** + * Test that a TimeoutException causes retry behavior (not abort). + */ + @ParameterizedTest + @BooleanSource + void testTimeoutExceptionRetries(boolean customTimeout) { + final String indexType = "timeoutExceptionIndex"; + final String exceptionMessage = "my timeout"; + AtomicInteger attemptCount = new AtomicInteger(0); + + TestFactory.register(indexType, state -> { + adjustMergeControl(state); + + attemptCount.incrementAndGet(); + + // Throw a TimeoutException (not FDB timeout) + final CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally( + customTimeout ? + new CustomOperationTimeoutException(exceptionMessage) : + new java.util.concurrent.TimeoutException(exceptionMessage)); + return future; + }); + + final FDBRecordStore.Builder storeBuilder = createStore(indexType); + try (OnlineIndexer indexer = OnlineIndexer.newBuilder() + .setRecordStoreBuilder(storeBuilder) + .setTargetIndexesByName(List.of(INDEX_NAME)) + .setMaxAttempts(5) + .build()) { + Exception thrownException = Assertions.assertThrows(Exception.class, indexer::mergeIndex); + // Assert that the timeout exception is in the cause chain + var timeoutCause = + customTimeout ? + findCause(thrownException, CustomOperationTimeoutException.class) : + findCause(thrownException, TimeoutException.class); + Assertions.assertNotNull(timeoutCause); + Assertions.assertEquals(exceptionMessage, timeoutCause.getMessage()); + } + // Assert multiple retries + assertTrue(1 < attemptCount.get()); + } + + private static void adjustMergeControl(final IndexMaintainerState state) { + final IndexDeferredMaintenanceControl mergeControl = state.store.getIndexDeferredMaintenanceControl(); + mergeControl.setLastStep(IndexDeferredMaintenanceControl.LastStep.MERGE); + if (mergeControl.getMergesLimit() == 0) { + mergeControl.setMergesTried(10); + mergeControl.setMergesFound(10); + } else { + mergeControl.setMergesTried(mergeControl.getMergesLimit()); + mergeControl.setMergesFound(10); + } + } + + private static class CustomOperationTimeoutException extends RuntimeException { + private static final long serialVersionUID = -7034897190745766777L; + + public CustomOperationTimeoutException(String message) { + super(message); + } + } + + private static T findCause(Throwable throwable, Class causeType) { + Throwable current = throwable; + while (current != null) { + if (causeType.isInstance(current)) { + return causeType.cast(current); + } + current = current.getCause(); + } + return null; + } + + /** + * Test that an FDBException wrapped in another exception still triggers retry behavior. + */ + @Test + void testWrappedFDBExceptionRetries() { + final String indexType = "wrappedFdbExceptionIndex"; + AtomicInteger attemptCount = new AtomicInteger(0); + + TestFactory.register(indexType, state -> { + adjustMergeControl(state); + + attemptCount.incrementAndGet(); + + // Wrap FDBException in multiple layers to test deep unwrapping + final CompletableFuture future = new CompletableFuture<>(); + FDBException fdbEx = new FDBException("Transaction too old", FDBError.TRANSACTION_TOO_OLD.code()); + RuntimeException wrapper = new RuntimeException("Wrapper level 1", + new IllegalStateException("Wrapper level 2", fdbEx)); + future.completeExceptionally(wrapper); + return future; + }); + + final FDBRecordStore.Builder storeBuilder = createStore(indexType); + try (OnlineIndexer indexer = OnlineIndexer.newBuilder() + .setRecordStoreBuilder(storeBuilder) + .setTargetIndexesByName(List.of(INDEX_NAME)) + .setMaxAttempts(5) + .build()) { + + Exception thrownException = Assertions.assertThrows(Exception.class, indexer::mergeIndex); + + // Assert that FDBException is in the cause chain + FDBException fdbCause = findCause(thrownException, FDBException.class); + Assertions.assertNotNull(fdbCause); + assertEquals(FDBError.TRANSACTION_TOO_OLD.code(), fdbCause.getCode()); + } + + // Assert multiple retries + assertTrue(1 < attemptCount.get()); + } + + /** + * Test that a non-retriable exception during REPARTITION phase causes immediate abort. + */ + @Test + void testNonRetriableExceptionDuringRepartitionAborts() { + final String indexType = "nonRetriableRepartitionIndex"; + AtomicInteger attemptCount = new AtomicInteger(0); + + TestFactory.register(indexType, state -> { + final IndexDeferredMaintenanceControl mergeControl = state.store.getIndexDeferredMaintenanceControl(); + mergeControl.setLastStep(IndexDeferredMaintenanceControl.LastStep.REPARTITION); + mergeControl.setRepartitionDocumentCount(20); + + attemptCount.incrementAndGet(); + + // Throw a non-retriable exception during repartition + final CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new NullPointerException("Unexpected null during repartition")); + return future; + }); + + final FDBRecordStore.Builder storeBuilder = createStore(indexType); + try (OnlineIndexer indexer = OnlineIndexer.newBuilder() + .setRecordStoreBuilder(storeBuilder) + .setTargetIndexesByName(List.of(INDEX_NAME)) + .setMaxAttempts(10) + .build()) { + Assertions.assertThrows(NullPointerException.class, indexer::mergeIndex); + } + + // Should only attempt once, no retries + assertEquals(1, attemptCount.get()); + } + @Nonnull private FDBRecordStore.Builder createStore(@Nonnull final String indexType) { Index index = new Index(INDEX_NAME, Key.Expressions.field("num_value_2"), @@ -333,7 +517,15 @@ public CompletableFuture mergeIndex() { @Nonnull @Override public Iterable getIndexTypes() { - return List.of("repartitionTimeoutIndex", "mergeTimeoutIndex", "mergeLimitedIndex"); + return List.of( + "repartitionTimeoutIndex", + "mergeLimitedIndex", + "nonRetriableExceptionIndex", + "wrappedFdbExceptionIndex", + "mergeTimeoutIndex", + "timeoutExceptionIndex", + "nonRetriableRepartitionIndex" + ); } @Nonnull From 7b25720db6fc7c8f0352cdfbb76a7770dad1cfc5 Mon Sep 17 00:00:00 2001 From: Josef Ezra Date: Tue, 2 Dec 2025 11:10:51 -0500 Subject: [PATCH 5/5] Fix a failing test --- .../foundationdb/record/lucene/FDBLuceneIndexFailureTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneIndexFailureTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneIndexFailureTest.java index cf20e7ade8..2fbe10a946 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneIndexFailureTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/FDBLuceneIndexFailureTest.java @@ -192,10 +192,11 @@ void repartitionGroupedTestWithExceptionMapping(boolean useLegacyAsyncToSync, bo } // run partitioning without failure - make sure the index is still in good shape + timer.reset(); explicitMergeIndex(index, contextProps, schemaSetup, false, 0); try (FDBRecordContext context = openContext(contextProps)) { schemaSetup.accept(context); - assertEquals(2, getCounter(context, LuceneEvents.Counts.LUCENE_REPARTITION_CALLS).getCount()); + assertEquals(1, getCounter(context, LuceneEvents.Counts.LUCENE_REPARTITION_CALLS).getCount()); } partitionInfos = getPartitionMeta(index, groupingKey, contextProps, schemaSetup); // It should first move 6 from the most-recent to a new, older partition, then move 6 again into a partition