diff --git a/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj b/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj index b3e7e2fb9..a86ad09ac 100644 --- a/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj +++ b/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj @@ -11,7 +11,7 @@ 0 - 2 + 3 0 $(MajorVersion).$(MinorVersion).$(PatchVersion) $(VersionPrefix).0 diff --git a/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs b/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs index 2c41bce57..a5dd568b2 100644 --- a/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs +++ b/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs @@ -170,6 +170,14 @@ public class AzureStorageOrchestrationServiceSettings /// public bool AllowReplayingTerminalInstances { get; set; } = false; + /// + /// Gets or sets the timeout for read and write operations on the partition table in partition manager V3 (table partition manager). + /// This helps detect and recover from potential silent hangs caused by the Azure Storage client's internal retries. + /// If the operation exceeds the timeout, a PartitionManagerWarning is logged and the operation is retried. + /// The default is 2 seconds. + /// + public TimeSpan PartitionTableOperationTimeout { get; set; } = TimeSpan.FromSeconds(2); + /// /// If UseAppLease is true, gets or sets the AppLeaseOptions used for acquiring the lease to start the application. 
/// diff --git a/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj b/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj index a37429800..97fe26d24 100644 --- a/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj +++ b/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj @@ -21,8 +21,8 @@ 2 - 0 - 2 + 1 + 0 $(MajorVersion).$(MinorVersion).$(PatchVersion) $(VersionPrefix).0 diff --git a/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs b/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs index 2685c9be2..88665fa86 100644 --- a/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs +++ b/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs @@ -118,7 +118,10 @@ async Task PartitionManagerLoop(CancellationToken gracefulShutdownToken, Cancell try { - ReadTableReponse response = await this.tableLeaseManager.ReadAndWriteTableAsync(isShuttingDown, forcefulShutdownToken); + using var timeoutCts = new CancellationTokenSource(this.settings.PartitionTableOperationTimeout); + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(forcefulShutdownToken, timeoutCts.Token); + + ReadTableReponse response = await this.tableLeaseManager.ReadAndWriteTableAsync(isShuttingDown, linkedCts.Token); // If shutdown is requested and already released all ownership leases, then break the loop. if (isShuttingDown && response.ReleasedAllLeases) @@ -147,6 +150,20 @@ async Task PartitionManagerLoop(CancellationToken gracefulShutdownToken, Cancell { consecutiveFailureCount++; } + // ReadAndWriteTableAsync exceeded the set timeout. + // This may indicate a transient storage or network issue. + // The operation will be retried immediately unless it fails more than 10 consecutive times. 
+ catch (OperationCanceledException) when (!forcefulShutdownToken.IsCancellationRequested) + { + this.settings.Logger.PartitionManagerWarning( + this.storageAccountName, + this.settings.TaskHubName, + this.settings.WorkerId, + partitionId: NotApplicable, + details: $"Operation to read and write the partition table exceeded the configured timeout of {this.settings.PartitionTableOperationTimeout}."); + + consecutiveFailureCount++; + } // Eat any unexpected exceptions. catch (Exception exception) {