From a1e9f0bc630fa4cf1bdc0609de45fdfa25954b1b Mon Sep 17 00:00:00 2001 From: Kenny Gutierrez Date: Wed, 27 May 2026 11:24:50 -0500 Subject: [PATCH] Page the move-redirects job so it survives large tables The "Register content move redirects" job loaded the entire NotFoundHandler.ContentUrlHistory table in one unbounded query (GetAllMoved -> .ToList()). On large tables that SELECT exceeds the SqlClient command timeout, and SqlDataExecutor.ExecuteQuery swallowed the exception then returned ds.Tables[0] on an empty DataSet, surfacing a misleading "Cannot find table 0" instead of the real timeout. - SqlDataExecutor: apply a configurable CommandTimeout and rethrow query failures instead of masking them as "Cannot find table 0". - NotFoundHandlerOptions.CommandTimeout (default 30s). - IContentUrlHistoryLoader/SqlContentUrlHistoryRepository: add a paged GetAllMoved(skip, take) using OFFSET/FETCH over the moved keys, and make the parameterless overload stream pages so the unbounded query is gone. - RegisterMovedContentRedirectsJob: process the table page-by-page, tunable via OptimizelyNotFoundHandlerOptions.MovedContentBatchSize (default 1000). - Tests for repository paging and job batching; README updated. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 10 ++- .../IContentUrlHistoryLoader.cs | 10 +++ .../RegisterMovedContentRedirectsJob.cs | 67 +++++++++++------ .../Data/SqlContentUrlHistoryRepository.cs | 37 ++++++++- .../OptimizelyNotFoundHandlerOptions.cs | 6 ++ .../Data/SqlDataExecutor.cs | 11 ++- .../Configuration/NotFoundHandlerOptions.cs | 6 ++ .../RegisterMovedContentRedirectsJobTests.cs | 72 ++++++++++++++++++ .../SqlContentUrlHistoryRepositoryTests.cs | 75 +++++++++++++++++++ 9 files changed, 265 insertions(+), 29 deletions(-) create mode 100644 tests/Geta.NotFoundHandler.Optimizely.Tests/AutomaticRedirects/RegisterMovedContentRedirectsJobTests.cs create mode 100644 tests/Geta.NotFoundHandler.Optimizely.Tests/AutomaticRedirects/SqlContentUrlHistoryRepositoryTests.cs diff --git a/README.md b/README.md index b51d2002..863c297f 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,8 @@ If the `bufferSize` is set to `0`, the `threshold` value will be ignored, and ev **LogWithHostname**: Set to `true` to include hostname in the log. Useful in a multisite environment with several hostnames/domains. Default is `false` +**CommandTimeout**: Command timeout, in seconds, applied to SQL queries issued by the NotFound handler. Raise it past the SqlClient default of 30 if you have very large NotFound handler tables. Default is 30 + **ActiveStatusCodes**: A integerlist with the status codes that NotFoundHandler will be active on. (Ex. ```options.ActiveStatusCodes = new int[] { StatusCodes.Status404NotFound, StatusCodes.Status410Gone };```) ### Specifying ignored resources @@ -320,7 +322,13 @@ An Optimizely scheduled job was added - [Geta NotFoundHandler] Suggestions Additionally, there are two optimizely scheduled jobs responsible for: - *[Geta NotFoundHandler] Index content URLs* - as mentioned before, this job indexes URLs of content. Usually, it is required to run this job only once. All new content is automatically indexed. But if for some reason content publish events are not firing when creating new content (for example, during the import), then you should set this job to run frequently. -- *[Geta NotFoundHandler] Register content move redirects* - this job creates redirects based on registered moved content. Normally, this job is not required at all, but there might be situations when content move is registered but redirect creation is not completed. This could happen during deployments. In this case, you can manually run this job or schedule it to run time to time to fix such issues. +- *[Geta NotFoundHandler] Register content move redirects* - this job creates redirects based on registered moved content. Normally, this job is not required at all, but there might be situations when content move is registered but redirect creation is not completed. This could happen during deployments. In this case, you can manually run this job or schedule it to run time to time to fix such issues. The job processes the `NotFoundHandler.ContentUrlHistory` table page-by-page so it scales to large tables. You can tune the page size via `MovedContentBatchSize` (default 1000): +``` +services.AddOptimizelyNotFoundHandler(o => +{ + o.MovedContentBatchSize = 1000; +}); +``` # Troubleshooting diff --git a/src/Geta.NotFoundHandler.Optimizely/Core/AutomaticRedirects/IContentUrlHistoryLoader.cs b/src/Geta.NotFoundHandler.Optimizely/Core/AutomaticRedirects/IContentUrlHistoryLoader.cs index 912ab048..534feb9e 100644 --- a/src/Geta.NotFoundHandler.Optimizely/Core/AutomaticRedirects/IContentUrlHistoryLoader.cs +++ b/src/Geta.NotFoundHandler.Optimizely/Core/AutomaticRedirects/IContentUrlHistoryLoader.cs @@ -9,6 +9,16 @@ public interface IContentUrlHistoryLoader { bool IsRegistered(ContentUrlHistory entity); IEnumerable<(string contentKey, IReadOnlyCollection histories)> GetAllMoved(); + + /// + /// Returns a single page of moved content (keys with more than one URL-history entry), + /// ordered by content key. Lets callers stream large tables instead of loading every row + /// in one unbounded query. + /// + /// Number of moved content keys to skip. + /// Maximum number of moved content keys to return. + IEnumerable<(string contentKey, IReadOnlyCollection histories)> GetAllMoved(int skip, int take); + IReadOnlyCollection GetMoved(string contentKey); } } diff --git a/src/Geta.NotFoundHandler.Optimizely/Core/AutomaticRedirects/RegisterMovedContentRedirectsJob.cs b/src/Geta.NotFoundHandler.Optimizely/Core/AutomaticRedirects/RegisterMovedContentRedirectsJob.cs index 5d69dda0..871b38cb 100644 --- a/src/Geta.NotFoundHandler.Optimizely/Core/AutomaticRedirects/RegisterMovedContentRedirectsJob.cs +++ b/src/Geta.NotFoundHandler.Optimizely/Core/AutomaticRedirects/RegisterMovedContentRedirectsJob.cs @@ -6,6 +6,8 @@ using EPiServer.PlugIn; using EPiServer.Scheduler; using Geta.NotFoundHandler.Optimizely.Infrastructure; +using Geta.NotFoundHandler.Optimizely.Infrastructure.Configuration; +using Microsoft.Extensions.Options; namespace Geta.NotFoundHandler.Optimizely.Core.AutomaticRedirects { @@ -17,14 +19,17 @@ public class RegisterMovedContentRedirectsJob : ScheduledJobBase private readonly IContentUrlHistoryLoader _contentUrlHistoryLoader; private readonly JobStatusLogger _jobStatusLogger; private readonly IAutomaticRedirectsService _automaticRedirectsService; + private readonly int _batchSize; private bool _stopped; public RegisterMovedContentRedirectsJob( IAutomaticRedirectsService automaticRedirectsService, - IContentUrlHistoryLoader contentUrlHistoryLoader) + IContentUrlHistoryLoader contentUrlHistoryLoader, + IOptions options) { _automaticRedirectsService = automaticRedirectsService; _contentUrlHistoryLoader = contentUrlHistoryLoader; + _batchSize = options.Value.MovedContentBatchSize; _jobStatusLogger = new JobStatusLogger(OnStatusChanged); IsStoppable = true; @@ -32,45 +37,59 @@ public RegisterMovedContentRedirectsJob( public override string Execute() { - var movedContent = _contentUrlHistoryLoader.GetAllMoved().ToList(); - var totalCount = movedContent.Count; var successCount = 0; var failedCount = 0; var currentCount = 0; + var skip = 0; - _jobStatusLogger.LogWithStatus($"In total will process moved content: {totalCount}"); + _jobStatusLogger.LogWithStatus($"Processing moved content in batches of {_batchSize}"); - foreach (var content in movedContent) + // Page through the moved content rather than materialising the whole table up front. + // CreateRedirects only touches the redirects store, not ContentUrlHistory, so the moved + // set is stable for the duration of the run and skip-based paging never skips a key. + while (true) { - if (_stopped) + var batch = _contentUrlHistoryLoader.GetAllMoved(skip, _batchSize).ToList(); + + foreach (var content in batch) { - _jobStatusLogger.Log( - $"Job was stopped, successful content handled before stopped: {successCount} out of total {totalCount} content"); - return _jobStatusLogger.ToString(); - } + if (_stopped) + { + _jobStatusLogger.Log( + $"Job was stopped, successful content handled before stopped: {successCount} out of {currentCount} processed"); + return _jobStatusLogger.ToString(); + } - currentCount++; + currentCount++; - try - { - _automaticRedirectsService.CreateRedirects(content.histories); - successCount++; - } - catch (Exception ex) - { - _jobStatusLogger.Log($"Processing [{content.contentKey}] failed, exception: {ex}"); - failedCount++; + try + { + _automaticRedirectsService.CreateRedirects(content.histories); + successCount++; + } + catch (Exception ex) + { + _jobStatusLogger.Log($"Processing [{content.contentKey}] failed, exception: {ex}"); + failedCount++; + } + + if (currentCount % 500 == 0) + { + _jobStatusLogger.Status( + $"Processed {currentCount}, of whom successful {successCount}; failed: {failedCount}"); + } } - if (currentCount % 500 == 0) + if (batch.Count < _batchSize) { - _jobStatusLogger.Status( - $"Processed {currentCount} of whom successful {successCount} out of total {totalCount} content; failed: {failedCount}"); + break; } + + skip += _batchSize; } _jobStatusLogger.Log( - $"Processed {currentCount} of whom successful {successCount} out of total {totalCount} content; failed: {failedCount}"); + $"Processed {currentCount}, of whom successful {successCount}; failed: {failedCount}"); return _jobStatusLogger.ToString(); } diff --git a/src/Geta.NotFoundHandler.Optimizely/Data/SqlContentUrlHistoryRepository.cs b/src/Geta.NotFoundHandler.Optimizely/Data/SqlContentUrlHistoryRepository.cs index 5905597c..78b4c6cc 100644 --- a/src/Geta.NotFoundHandler.Optimizely/Data/SqlContentUrlHistoryRepository.cs +++ b/src/Geta.NotFoundHandler.Optimizely/Data/SqlContentUrlHistoryRepository.cs @@ -18,6 +18,7 @@ public class SqlContentUrlHistoryRepository : IRepository, IC { private const string ContentUrlHistoryTable = "[dbo].[NotFoundHandler.ContentUrlHistory]"; private const string AllFields = "Id, ContentKey, Urls, CreatedUtc, md5_ContentKey"; + private const int DefaultPageSize = 1000; private readonly IDataExecutor _dataExecutor; public SqlContentUrlHistoryRepository(IDataExecutor dataExecutor) @@ -69,17 +70,47 @@ public bool IsRegistered(ContentUrlHistory entity) public IEnumerable<(string contentKey, IReadOnlyCollection histories)> GetAllMoved() { + // Stream the table one page at a time rather than issuing a single unbounded query that + // grows with the table and can exceed the command timeout on large sites. + var skip = 0; + while (true) + { + var page = GetAllMoved(skip, DefaultPageSize).ToList(); + + foreach (var moved in page) + { + yield return moved; + } + + if (page.Count < DefaultPageSize) + { + yield break; + } + + skip += DefaultPageSize; + } + } + + public IEnumerable<(string contentKey, IReadOnlyCollection histories)> GetAllMoved(int skip, int take) + { + // Page over the moved content keys (md5_ContentKey is the hash of ContentKey, so each key + // maps to a single group and is never split across pages) and return their histories. var sqlCommand = $@"SELECT h.Id, h.ContentKey, h.Urls, h.CreatedUtc, h.md5_ContentKey FROM {ContentUrlHistoryTable} h - INNER JOIN + INNER JOIN (SELECT ContentKey, md5_ContentKey FROM {ContentUrlHistoryTable} GROUP BY ContentKey, md5_ContentKey - HAVING COUNT(*) > 1) k + HAVING COUNT(*) > 1 + ORDER BY ContentKey, md5_ContentKey + OFFSET @skip ROWS FETCH NEXT @take ROWS ONLY) k ON h.ContentKey = k.ContentKey AND h.md5_ContentKey = k.md5_ContentKey ORDER BY h.ContentKey, h.CreatedUtc DESC"; - var dataTable = _dataExecutor.ExecuteQuery(sqlCommand); + var dataTable = _dataExecutor.ExecuteQuery( + sqlCommand, + _dataExecutor.CreateIntParameter("skip", skip), + _dataExecutor.CreateIntParameter("take", take)); var histories = ToContentUrlHistory(dataTable); diff --git a/src/Geta.NotFoundHandler.Optimizely/Infrastructure/Configuration/OptimizelyNotFoundHandlerOptions.cs b/src/Geta.NotFoundHandler.Optimizely/Infrastructure/Configuration/OptimizelyNotFoundHandlerOptions.cs index be865c1a..efa949ee 100644 --- a/src/Geta.NotFoundHandler.Optimizely/Infrastructure/Configuration/OptimizelyNotFoundHandlerOptions.cs +++ b/src/Geta.NotFoundHandler.Optimizely/Infrastructure/Configuration/OptimizelyNotFoundHandlerOptions.cs @@ -16,6 +16,12 @@ public class OptimizelyNotFoundHandlerOptions public bool AutomaticRedirectsEnabled { get; set; } public RedirectType AutomaticRedirectType { get; set; } = RedirectType.Temporary; + /// + /// Number of moved content keys the "Register content move redirects" job loads per page. + /// The job processes the table page-by-page so it scales to large NotFoundHandler tables. + /// + public int MovedContentBatchSize { get; set; } = 1000; + private readonly List _contentKeyProviders = new(); public IEnumerable ContentKeyProviders => _contentKeyProviders; diff --git a/src/Geta.NotFoundHandler/Data/SqlDataExecutor.cs b/src/Geta.NotFoundHandler/Data/SqlDataExecutor.cs index c2f5be91..30123996 100644 --- a/src/Geta.NotFoundHandler/Data/SqlDataExecutor.cs +++ b/src/Geta.NotFoundHandler/Data/SqlDataExecutor.cs @@ -16,12 +16,14 @@ public class SqlDataExecutor : IDataExecutor { private readonly ILogger _logger; private readonly string _connectionString; + private readonly int _commandTimeout; public SqlDataExecutor( IOptions options, ILogger logger) { _connectionString = options.Value.ConnectionString; + _commandTimeout = options.Value.CommandTimeout; _logger = logger; } @@ -42,6 +44,12 @@ public DataTable ExecuteQuery(string sqlCommand, params IDbDataParameter[] param _logger.LogError(ex, "An error occurred in the ExecuteSQL method with the following sql: {SqlCommand}", sqlCommand); + + // Previously the exception was swallowed here and execution fell through to + // 'return ds.Tables[0]'. On a failed Fill the DataSet has no tables, so that line + // threw IndexOutOfRangeException ("Cannot find table 0"), masking the real cause + // (e.g. a command timeout). Rethrow so callers see the actual failure. + throw; } return ds.Tables[0]; @@ -199,10 +207,11 @@ private static SqlParameter CreateReturnParameter() return parameter; } - private static SqlCommand CreateCommand(SqlConnection connection, string sqlCommand, params IDbDataParameter[] parameters) + private SqlCommand CreateCommand(SqlConnection connection, string sqlCommand, params IDbDataParameter[] parameters) { var command = connection.CreateCommand(); command.CommandText = sqlCommand; + command.CommandTimeout = _commandTimeout; if (parameters != null) { diff --git a/src/Geta.NotFoundHandler/Infrastructure/Configuration/NotFoundHandlerOptions.cs b/src/Geta.NotFoundHandler/Infrastructure/Configuration/NotFoundHandlerOptions.cs index 6838358f..5fcb93c0 100644 --- a/src/Geta.NotFoundHandler/Infrastructure/Configuration/NotFoundHandlerOptions.cs +++ b/src/Geta.NotFoundHandler/Infrastructure/Configuration/NotFoundHandlerOptions.cs @@ -17,6 +17,12 @@ public class NotFoundHandlerOptions public int BufferSize { get; set; } = 30; public int ThreshHold { get; set; } = 5; + + /// + /// Command timeout, in seconds, applied to SQL queries issued by . + /// Exposed so sites with large NotFoundHandler tables can raise it past the SqlClient default of 30s. + /// + public int CommandTimeout { get; set; } = 30; public SuggestionsCleanupOptions SuggestionsCleanupOptions { get; set; } = new(); public bool UseInternalScheduler { get; set; } public string InternalSchedulerCronInterval { get; set; } = "0 0 * * *"; diff --git a/tests/Geta.NotFoundHandler.Optimizely.Tests/AutomaticRedirects/RegisterMovedContentRedirectsJobTests.cs b/tests/Geta.NotFoundHandler.Optimizely.Tests/AutomaticRedirects/RegisterMovedContentRedirectsJobTests.cs new file mode 100644 index 00000000..3de2b25d --- /dev/null +++ b/tests/Geta.NotFoundHandler.Optimizely.Tests/AutomaticRedirects/RegisterMovedContentRedirectsJobTests.cs @@ -0,0 +1,72 @@ +using System.Collections.Generic; +using System.Linq; +using FakeItEasy; +using Geta.NotFoundHandler.Optimizely.Core.AutomaticRedirects; +using Geta.NotFoundHandler.Optimizely.Infrastructure.Configuration; +using Microsoft.Extensions.Options; +using Xunit; + +namespace Geta.NotFoundHandler.Optimizely.Tests.AutomaticRedirects; + +public class RegisterMovedContentRedirectsJobTests +{ + private readonly IAutomaticRedirectsService _redirectsService = A.Fake(); + private readonly IContentUrlHistoryLoader _loader = A.Fake(); + + [Fact] + public void Execute_processes_every_page() + { + A.CallTo(() => _loader.GetAllMoved(0, 2)).Returns(Moved("a", "b")); + A.CallTo(() => _loader.GetAllMoved(2, 2)).Returns(Moved("c")); + var job = CreateJob(batchSize: 2); + + job.Execute(); + + A.CallTo(() => _redirectsService.CreateRedirects(A>._)) + .MustHaveHappened(3, Times.Exactly); + A.CallTo(() => _loader.GetAllMoved(0, 2)).MustHaveHappened(); + A.CallTo(() => _loader.GetAllMoved(2, 2)).MustHaveHappened(); + } + + [Fact] + public void Execute_does_not_request_next_page_after_a_partial_page() + { + A.CallTo(() => _loader.GetAllMoved(0, 5)).Returns(Moved("a", "b")); + var job = CreateJob(batchSize: 5); + + job.Execute(); + + A.CallTo(() => _loader.GetAllMoved(5, 5)).MustNotHaveHappened(); + } + + [Fact] + public void Execute_continues_when_a_single_item_fails() + { + A.CallTo(() => _loader.GetAllMoved(0, 10)).Returns(Moved("good", "bad", "good")); + A.CallTo(() => _redirectsService.CreateRedirects(A>.That.Matches( + x => x.Count == 1 && x.First().ContentKey == "bad"))) + .Throws(new System.InvalidOperationException("boom")); + var job = CreateJob(batchSize: 10); + + var result = job.Execute(); + + // All three are attempted even though the middle one throws. + A.CallTo(() => _redirectsService.CreateRedirects(A>._)) + .MustHaveHappened(3, Times.Exactly); + Assert.Contains("failed", result); + } + + private RegisterMovedContentRedirectsJob CreateJob(int batchSize) + { + var options = Options.Create(new OptimizelyNotFoundHandlerOptions { MovedContentBatchSize = batchSize }); + return new RegisterMovedContentRedirectsJob(_redirectsService, _loader, options); + } + + private static IEnumerable<(string contentKey, IReadOnlyCollection histories)> Moved(params string[] keys) + { + foreach (var key in keys) + { + yield return (key, new List { new() { ContentKey = key } }); + } + } +} diff --git a/tests/Geta.NotFoundHandler.Optimizely.Tests/AutomaticRedirects/SqlContentUrlHistoryRepositoryTests.cs b/tests/Geta.NotFoundHandler.Optimizely.Tests/AutomaticRedirects/SqlContentUrlHistoryRepositoryTests.cs new file mode 100644 index 00000000..d9f187ef --- /dev/null +++ b/tests/Geta.NotFoundHandler.Optimizely.Tests/AutomaticRedirects/SqlContentUrlHistoryRepositoryTests.cs @@ -0,0 +1,75 @@ +using System; +using System.Data; +using System.Linq; +using FakeItEasy; +using Geta.NotFoundHandler.Data; +using Geta.NotFoundHandler.Optimizely.Data; +using Xunit; + +namespace Geta.NotFoundHandler.Optimizely.Tests.AutomaticRedirects; + +public class SqlContentUrlHistoryRepositoryTests +{ + private readonly IDataExecutor _dataExecutor = A.Fake(); + private readonly SqlContentUrlHistoryRepository _repository; + + public SqlContentUrlHistoryRepositoryTests() + { + _repository = new SqlContentUrlHistoryRepository(_dataExecutor); + } + + [Fact] + public void GetAllMoved_paged_passes_skip_and_take() + { + A.CallTo(() => _dataExecutor.ExecuteQuery(A._, A._)).Returns(EmptyTable()); + + _ = _repository.GetAllMoved(40, 20).ToList(); + + A.CallTo(() => _dataExecutor.CreateIntParameter("skip", 40)).MustHaveHappened(); + A.CallTo(() => _dataExecutor.CreateIntParameter("take", 20)).MustHaveHappened(); + } + + [Fact] + public void GetAllMoved_paged_groups_histories_by_content_key() + { + var table = EmptyTable(); + AddRow(table, "key-a"); + AddRow(table, "key-a"); + AddRow(table, "key-b"); + A.CallTo(() => _dataExecutor.ExecuteQuery(A._, A._)).Returns(table); + + var result = _repository.GetAllMoved(0, 1000).ToList(); + + Assert.Equal(2, result.Count); + Assert.Equal(2, result.Single(x => x.contentKey == "key-a").histories.Count); + Assert.Equal(1, result.Single(x => x.contentKey == "key-b").histories.Count); + } + + [Fact] + public void GetAllMoved_stops_paging_when_page_is_not_full() + { + var shortPage = EmptyTable(); + AddRow(shortPage, "only-key"); + A.CallTo(() => _dataExecutor.ExecuteQuery(A._, A._)).Returns(shortPage); + + _ = _repository.GetAllMoved().ToList(); + + // The first page returns fewer rows than the page size, so no further query is issued. + A.CallTo(() => _dataExecutor.ExecuteQuery(A._, A._)).MustHaveHappenedOnceExactly(); + } + + private static DataTable EmptyTable() + { + var table = new DataTable(); + table.Columns.Add("Id", typeof(Guid)); + table.Columns.Add("ContentKey", typeof(string)); + table.Columns.Add("Urls", typeof(string)); + table.Columns.Add("CreatedUtc", typeof(DateTime)); + return table; + } + + private static void AddRow(DataTable table, string contentKey) + { + table.Rows.Add(Guid.NewGuid(), contentKey, string.Empty, DateTime.UtcNow); + } +}