From 150a45f9b7b99bd65df649a30dbe6e319ce55de2 Mon Sep 17 00:00:00 2001 From: "naiyuantian@microsoft.com" Date: Thu, 10 Apr 2025 11:52:58 -0700 Subject: [PATCH 1/2] initial commit --- .../DurableTask.ApplicationInsights.csproj | 2 +- .../DurableTask.AzureStorage.csproj | 4 ++-- .../Partitioning/TablePartitionManager.cs | 19 ++++++++++++++++++- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj b/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj index b3e7e2fb9..a86ad09ac 100644 --- a/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj +++ b/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj @@ -11,7 +11,7 @@ 0 - 2 + 3 0 $(MajorVersion).$(MinorVersion).$(PatchVersion) $(VersionPrefix).0 diff --git a/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj b/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj index a37429800..97fe26d24 100644 --- a/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj +++ b/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj @@ -21,8 +21,8 @@ 2 - 0 - 2 + 1 + 0 $(MajorVersion).$(MinorVersion).$(PatchVersion) $(VersionPrefix).0 diff --git a/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs b/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs index 2685c9be2..e64580361 100644 --- a/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs +++ b/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs @@ -118,7 +118,10 @@ async Task PartitionManagerLoop(CancellationToken gracefulShutdownToken, Cancell try { - ReadTableReponse response = await this.tableLeaseManager.ReadAndWriteTableAsync(isShuttingDown, forcefulShutdownToken); + using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(2)); + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(forcefulShutdownToken, 
timeoutCts.Token); + + ReadTableReponse response = await this.tableLeaseManager.ReadAndWriteTableAsync(isShuttingDown, linkedCts.Token); // If shutdown is requested and already released all ownership leases, then break the loop. if (isShuttingDown && response.ReleasedAllLeases) @@ -147,6 +150,20 @@ async Task PartitionManagerLoop(CancellationToken gracefulShutdownToken, Cancell { consecutiveFailureCount++; } + // ReadAndWriteTableAsync exceeded the 2-second timeout. + // This may indicate a transient storage or network issue. + // The operation will be retried immediately unless it fails more than 10 consecutive times. + catch (OperationCanceledException) when (!forcefulShutdownToken.IsCancellationRequested) + { + this.settings.Logger.PartitionManagerWarning( + this.storageAccountName, + this.settings.TaskHubName, + this.settings.WorkerId, + partitionId: NotApplicable, + details: "Operation to read and write the partition table exceeded the 2-second timeout."); + + consecutiveFailureCount++; + } // Eat any unexpected exceptions. 
catch (Exception exception) { From 478f171fba2d648bd35727b3d13dd20e52616f7b Mon Sep 17 00:00:00 2001 From: "naiyuantian@microsoft.com" Date: Fri, 11 Apr 2025 10:39:56 -0700 Subject: [PATCH 2/2] make this timeout configurable --- .../AzureStorageOrchestrationServiceSettings.cs | 8 ++++++++ .../Partitioning/TablePartitionManager.cs | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs b/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs index 2c41bce57..a5dd568b2 100644 --- a/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs +++ b/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs @@ -170,6 +170,14 @@ public class AzureStorageOrchestrationServiceSettings /// public bool AllowReplayingTerminalInstances { get; set; } = false; + /// + /// Specifies the timeout for read and write operations on the partition table in partition manager V3 (table partition manager). + /// This helps detect and recover from potential silent hangs caused by the Azure Storage client's internal retries. + /// If the operation exceeds the timeout, a PartitionManagerWarning is logged and the operation is retried. + /// The default is 2 seconds. + /// + public TimeSpan PartitionTableOperationTimeout { get; set; } = TimeSpan.FromSeconds(2); + /// /// If UseAppLease is true, gets or sets the AppLeaseOptions used for acquiring the lease to start the application. 
/// diff --git a/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs b/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs index e64580361..88665fa86 100644 --- a/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs +++ b/src/DurableTask.AzureStorage/Partitioning/TablePartitionManager.cs @@ -118,7 +118,7 @@ async Task PartitionManagerLoop(CancellationToken gracefulShutdownToken, Cancell try { - using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(2)); + using var timeoutCts = new CancellationTokenSource(this.settings.PartitionTableOperationTimeout); using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(forcefulShutdownToken, timeoutCts.Token); ReadTableReponse response = await this.tableLeaseManager.ReadAndWriteTableAsync(isShuttingDown, linkedCts.Token); @@ -150,7 +150,7 @@ async Task PartitionManagerLoop(CancellationToken gracefulShutdownToken, Cancell { consecutiveFailureCount++; } - // ReadAndWriteTableAsync exceeded the 2-second timeout. + // ReadAndWriteTableAsync exceeded the set timeout. // This may indicate a transient storage or network issue. // The operation will be retried immediately unless it fails more than 10 consecutive times. catch (OperationCanceledException) when (!forcefulShutdownToken.IsCancellationRequested)