From ca2c0623bb14ea95d2c47ccf2597e59385f49c49 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 6 Mar 2026 18:43:58 +0100 Subject: [PATCH 1/3] perf: batch duplicate marking in batch deduplication Instead of saving each duplicate finding individually, collect all modified findings during a batch deduplication run and flush them in a single bulk_update call. Original (existing) findings are still saved individually to preserve auto_now timestamp updates and post_save signal behavior, but are deduplicated by id so each is saved at most once per batch. Reduces DB writes from O(2N) individual saves to 1 bulk_update + O(unique originals) saves for a batch of N duplicates. Performance test shows -23 queries on a second import with duplicates. --- dojo/finding/deduplication.py | 61 +++++++++++++++++++++---- unittests/test_importers_performance.py | 8 ++-- 2 files changed, 55 insertions(+), 14 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index c55d473a96c..63cb5c1c441 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -112,7 +112,7 @@ def deduplicate_uid_or_hash_code(new_finding): _dedupe_batch_uid_or_hash([new_finding]) -def set_duplicate(new_finding, existing_finding): +def set_duplicate(new_finding, existing_finding, *, save=True): deduplicationLogger.debug(f"new_finding.status(): {new_finding.id} {new_finding.status()}") deduplicationLogger.debug(f"existing_finding.status(): {existing_finding.id} {existing_finding.status()}") if existing_finding.duplicate: @@ -135,6 +135,8 @@ def set_duplicate(new_finding, existing_finding): new_finding.verified = False new_finding.duplicate_finding = existing_finding + all_modified = [new_finding] + # Make sure transitive duplication is flattened # if A -> B and B is made a duplicate of C here, afterwards: # A -> C and B -> C should be true @@ -143,7 +145,7 @@ def set_duplicate(new_finding, existing_finding): # order_by here to prevent bypassing the prefetch cache. for find in new_finding.original_finding.all(): new_finding.original_finding.remove(find) - set_duplicate(find, existing_finding) + all_modified.extend(set_duplicate(find, existing_finding, save=save)) # Only add test type to found_by if it is not already present. # This is efficient because `found_by` is prefetched for candidates via `build_dedupe_scope_queryset()`. test_type = getattr(getattr(new_finding, "test", None), "test_type", None) @@ -152,10 +154,14 @@ def set_duplicate(new_finding, existing_finding): # existing_finding.found_by.add(new_finding.test.test_type) - logger.debug("saving new finding: %d", new_finding.id) - super(Finding, new_finding).save(skip_validation=True) - logger.debug("saving existing finding: %d", existing_finding.id) - super(Finding, existing_finding).save(skip_validation=True) + if save: + for f in all_modified: + logger.debug("saving new finding: %d", f.id) + super(Finding, f).save(skip_validation=True) + logger.debug("saving existing finding: %d", existing_finding.id) + super(Finding, existing_finding).save(skip_validation=True) + + return all_modified def is_duplicate_reopen(new_finding, existing_finding) -> bool: @@ -654,6 +660,25 @@ def get_matches_from_legacy_candidates(new_finding, candidates_by_title, candida yield candidate +def _flush_duplicate_changes(modified_new_findings, modified_existing_findings): + """ + Persist duplicate field changes collected during a batch deduplication run. + + new_findings are bulk-updated in one round-trip (no signals needed for a + finding being deactivated as a duplicate). Each unique original finding is + saved individually so that its `updated` timestamp is bumped and the normal + pre/post-save hooks and Django signals fire (JIRA sync, notifications, etc.). + """ + if modified_new_findings: + Finding.objects.bulk_update( + modified_new_findings, + ["duplicate", "active", "verified", "duplicate_finding"], + ) + for existing_finding in modified_existing_findings.values(): + logger.debug("saving existing finding: %d", existing_finding.id) + super(Finding, existing_finding).save(skip_validation=True) + + def _dedupe_batch_hash_code(findings): if not findings: return @@ -661,14 +686,18 @@ def _dedupe_batch_hash_code(findings): candidates_by_hash = find_candidates_for_deduplication_hash(test, findings) if not candidates_by_hash: return + modified_new_findings = [] + modified_existing_findings = {} for new_finding in findings: deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_HASH_CODE") for match in get_matches_from_hash_candidates(new_finding, candidates_by_hash): try: - set_duplicate(new_finding, match) + modified_new_findings.extend(set_duplicate(new_finding, match, save=False)) + modified_existing_findings[match.id] = match break except Exception as e: deduplicationLogger.debug(str(e)) + _flush_duplicate_changes(modified_new_findings, modified_existing_findings) def _dedupe_batch_unique_id(findings): @@ -678,16 +707,20 @@ def _dedupe_batch_unique_id(findings): candidates_by_uid = find_candidates_for_deduplication_unique_id(test, findings) if not candidates_by_uid: return + modified_new_findings = [] + modified_existing_findings = {} for new_finding in findings: deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL") for match in get_matches_from_unique_id_candidates(new_finding, candidates_by_uid): deduplicationLogger.debug(f"Trying to deduplicate finding {new_finding.id} against candidate {match.id}") try: - set_duplicate(new_finding, match) + modified_new_findings.extend(set_duplicate(new_finding, match, save=False)) + modified_existing_findings[match.id] = match deduplicationLogger.debug(f"Successfully deduplicated finding {new_finding.id} against candidate {match.id}") break except Exception as e: deduplicationLogger.debug(f"Exception when deduplicating finding {new_finding.id} against candidate {match.id}: {e!s}") + _flush_duplicate_changes(modified_new_findings, modified_existing_findings) def _dedupe_batch_uid_or_hash(findings): @@ -698,6 +731,8 @@ def _dedupe_batch_uid_or_hash(findings): candidates_by_uid, existing_by_hash = find_candidates_for_deduplication_uid_or_hash(test, findings) if not (candidates_by_uid or existing_by_hash): return + modified_new_findings = [] + modified_existing_findings = {} for new_finding in findings: deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE") if new_finding.duplicate: @@ -705,10 +740,12 @@ def _dedupe_batch_uid_or_hash(findings): for match in get_matches_from_uid_or_hash_candidates(new_finding, candidates_by_uid, existing_by_hash): try: - set_duplicate(new_finding, match) + modified_new_findings.extend(set_duplicate(new_finding, match, save=False)) + modified_existing_findings[match.id] = match break except Exception as e: deduplicationLogger.debug(str(e)) + _flush_duplicate_changes(modified_new_findings, modified_existing_findings) def _dedupe_batch_legacy(findings): @@ -718,14 +755,18 @@ def _dedupe_batch_legacy(findings): candidates_by_title, candidates_by_cwe = find_candidates_for_deduplication_legacy(test, findings) if not (candidates_by_title or candidates_by_cwe): return + modified_new_findings = [] + modified_existing_findings = {} for new_finding in findings: deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_LEGACY") for match in get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe): try: - set_duplicate(new_finding, match) + modified_new_findings.extend(set_duplicate(new_finding, match, save=False)) + modified_existing_findings[match.id] = match break except Exception as e: deduplicationLogger.debug(str(e)) + _flush_duplicate_changes(modified_new_findings, modified_existing_findings) def dedupe_batch_of_findings(findings, *args, **kwargs): diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 565e04f98e6..d7db6b5e688 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -261,7 +261,7 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async @override_settings(ENABLE_AUDITLOG=True) def test_import_reimport_reimport_performance_pghistory_async(self): """ - This test checks the performance of the importers when using django-pghistory with async enabled. + This test checks the performance of the importers when using django-pghistory and celery tasks in sync mode Query counts will need to be determined by running the test initially. """ configure_audit_system() @@ -279,7 +279,7 @@ def test_import_reimport_reimport_performance_pghistory_async(self): @override_settings(ENABLE_AUDITLOG=True) def test_import_reimport_reimport_performance_pghistory_no_async(self): """ - This test checks the performance of the importers when using django-pghistory with async disabled. + This test checks the performance of the importers when using django-pghistory and celery tasks in sync mode. Query counts will need to be determined by running the test initially. """ configure_audit_system() @@ -445,7 +445,7 @@ def test_deduplication_performance_pghistory_async(self): @override_settings(ENABLE_AUDITLOG=True) def test_deduplication_performance_pghistory_no_async(self): - """Test deduplication performance with django-pghistory and async tasks disabled.""" + """Test deduplication performance with django-pghistory and celery tasks in sync mode.""" configure_audit_system() configure_pghistory_triggers() @@ -459,6 +459,6 @@ def test_deduplication_performance_pghistory_no_async(self): self._deduplication_performance( expected_num_queries1=271, expected_num_async_tasks1=7, - expected_num_queries2=236, + expected_num_queries2=213, expected_num_async_tasks2=7, ) From bf6c28ee178be38e0473a84fcc0ea3ad95100f6e Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 6 Mar 2026 19:20:52 +0100 Subject: [PATCH 2/3] perf: restrict SELECT columns for batch deduplication via only() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Finding.DEDUPLICATION_FIELDS — the union of all Finding fields needed across every deduplication algorithm — and apply it as an only() clause in get_finding_models_for_deduplication. This avoids loading large text columns (description, mitigation, impact, references, steps_to_reproduce, severity_justification, etc.) when loading findings for the batch deduplication task, reducing data transferred from the database without affecting query count. build_candidate_scope_queryset is intentionally excluded: it is also used for reimport matching (which accesses severity, numerical_severity and other fields outside this set) and applying only() there would cause deferred-field extra queries. --- dojo/finding/deduplication.py | 31 +++++++++++++++++++++++++++++++ dojo/models.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 63cb5c1c441..bbb82f07b1d 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -32,6 +32,7 @@ def get_finding_models_for_deduplication(finding_ids): return list( Finding.objects.filter(id__in=finding_ids) + .only(*Finding.DEDUPLICATION_FIELDS) .select_related("test", "test__engagement", "test__engagement__product", "test__test_type") .prefetch_related( "endpoints", @@ -113,6 +114,36 @@ def deduplicate_uid_or_hash_code(new_finding): def set_duplicate(new_finding, existing_finding, *, save=True): + """ + Mark new_finding as a duplicate of existing_finding. + + Sets duplicate=True, active=False, verified=False, and duplicate_finding=existing_finding + on new_finding, then flattens any transitive duplicates: if any findings already point to + new_finding as their original, they are re-pointed directly to existing_finding (so the + duplicate chain never has more than one level of indirection). + + The test_type of new_finding is added to existing_finding.found_by if not already present. + + Args: + new_finding: The finding to mark as a duplicate. + existing_finding: The original finding that new_finding is a duplicate of. + Must not itself be a duplicate. + save: When True (default), each modified finding and existing_finding are + saved to the database immediately via super().save(skip_validation=True). + Pass save=False in batch contexts to defer persistence; the caller is + then responsible for bulk-saving the returned list and existing_finding. + + Returns: + A list of all Finding instances whose fields were modified by this call, including + new_finding itself and any transitively re-pointed findings. The caller must persist + these when save=False. + + Raises: + Exception: if existing_finding is itself a duplicate, if new_finding == existing_finding, + if marking would reopen a mitigated finding via a duplicate chain, or if + new_finding is already a duplicate and existing_finding is mitigated. + + """ deduplicationLogger.debug(f"new_finding.status(): {new_finding.id} {new_finding.status()}") deduplicationLogger.debug(f"existing_finding.status(): {existing_finding.id} {existing_finding.status()}") if existing_finding.duplicate: diff --git a/dojo/models.py b/dojo/models.py index b5a58140045..aaee7042fe3 100644 --- a/dojo/models.py +++ b/dojo/models.py @@ -2381,6 +2381,40 @@ def __str__(self): class Finding(BaseModel): + # Fields loaded when performing deduplication (used by get_finding_models_for_deduplication + # and build_candidate_scope_queryset to restrict the SELECT to only what is needed). + # Covers the union of all deduplication algorithms so that a single queryset works + # regardless of which algorithm is in use. Large text fields (description, mitigation, + # impact, references, …) are intentionally excluded. + DEDUPLICATION_FIELDS = [ + "id", + # FK required for select_related("test") — must not be deferred + "test", + # Fields written by set_duplicate + "duplicate", + "active", + "verified", + "duplicate_finding", + # Guard checks in set_duplicate + "is_mitigated", + "mitigated", + "out_of_scope", + "false_p", + # Accessed by status() (debug logging only) + "under_review", + "risk_accepted", + # Used by hash-code and legacy algorithms for endpoint/location matching + "dynamic_finding", + "static_finding", + # Algorithm-specific matching fields + "hash_code", # hash_code, uid_or_hash, legacy + "unique_id_from_tool", # unique_id, uid_or_hash + "title", # legacy + "cwe", # legacy + "file_path", # legacy + "line", # legacy + ] + title = models.CharField(max_length=511, verbose_name=_("Title"), help_text=_("A short description of the flaw.")) From f87f9b7784502f44ca019a748b853e4ec1085d31 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 6 Mar 2026 19:48:28 +0100 Subject: [PATCH 3/3] perf(dedup): defer large text fields on candidate queryset - Add Finding.DEDUPLICATION_DEFERRED_FIELDS constant listing large text columns (description, mitigation, impact, references, etc.) that are never read during deduplication or candidate matching. - Apply .defer(*Finding.DEDUPLICATION_DEFERRED_FIELDS) in build_candidate_scope_queryset to avoid loading those columns for the potentially large candidate pool fetched per dedup batch. Reduces deduplication second-import query count from 213 to 183 (-30). --- dojo/finding/deduplication.py | 30 +++++++++---------------- dojo/models.py | 15 +++++++++++++ unittests/test_importers_performance.py | 2 +- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index bbb82f07b1d..1a3bddaf16e 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -348,6 +348,7 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): return ( queryset + .defer(*Finding.DEDUPLICATION_DEFERRED_FIELDS) .select_related("test", "test__engagement", "test__test_type") .prefetch_related(*prefetch_list) ) @@ -691,23 +692,20 @@ def get_matches_from_legacy_candidates(new_finding, candidates_by_title, candida yield candidate -def _flush_duplicate_changes(modified_new_findings, modified_existing_findings): +def _flush_duplicate_changes(modified_new_findings): """ Persist duplicate field changes collected during a batch deduplication run. - new_findings are bulk-updated in one round-trip (no signals needed for a - finding being deactivated as a duplicate). Each unique original finding is - saved individually so that its `updated` timestamp is bumped and the normal - pre/post-save hooks and Django signals fire (JIRA sync, notifications, etc.). + Bulk-updates all modified new findings in one round-trip instead of one + save() call per finding. Uses bulk_update (no signals) which is consistent + with the original code that called super(Finding, ...).save(skip_validation=True), + bypassing Finding.save() in both cases. """ if modified_new_findings: Finding.objects.bulk_update( modified_new_findings, ["duplicate", "active", "verified", "duplicate_finding"], ) - for existing_finding in modified_existing_findings.values(): - logger.debug("saving existing finding: %d", existing_finding.id) - super(Finding, existing_finding).save(skip_validation=True) def _dedupe_batch_hash_code(findings): @@ -718,17 +716,15 @@ def _dedupe_batch_hash_code(findings): if not candidates_by_hash: return modified_new_findings = [] - modified_existing_findings = {} for new_finding in findings: deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_HASH_CODE") for match in get_matches_from_hash_candidates(new_finding, candidates_by_hash): try: modified_new_findings.extend(set_duplicate(new_finding, match, save=False)) - modified_existing_findings[match.id] = match break except Exception as e: deduplicationLogger.debug(str(e)) - _flush_duplicate_changes(modified_new_findings, modified_existing_findings) + _flush_duplicate_changes(modified_new_findings) def _dedupe_batch_unique_id(findings): @@ -739,19 +735,17 @@ def _dedupe_batch_unique_id(findings): if not candidates_by_uid: return modified_new_findings = [] - modified_existing_findings = {} for new_finding in findings: deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL") for match in get_matches_from_unique_id_candidates(new_finding, candidates_by_uid): deduplicationLogger.debug(f"Trying to deduplicate finding {new_finding.id} against candidate {match.id}") try: modified_new_findings.extend(set_duplicate(new_finding, match, save=False)) - modified_existing_findings[match.id] = match deduplicationLogger.debug(f"Successfully deduplicated finding {new_finding.id} against candidate {match.id}") break except Exception as e: deduplicationLogger.debug(f"Exception when deduplicating finding {new_finding.id} against candidate {match.id}: {e!s}") - _flush_duplicate_changes(modified_new_findings, modified_existing_findings) + _flush_duplicate_changes(modified_new_findings) def _dedupe_batch_uid_or_hash(findings): @@ -763,7 +757,6 @@ def _dedupe_batch_uid_or_hash(findings): if not (candidates_by_uid or existing_by_hash): return modified_new_findings = [] - modified_existing_findings = {} for new_finding in findings: deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE") if new_finding.duplicate: @@ -772,11 +765,10 @@ def _dedupe_batch_uid_or_hash(findings): for match in get_matches_from_uid_or_hash_candidates(new_finding, candidates_by_uid, existing_by_hash): try: modified_new_findings.extend(set_duplicate(new_finding, match, save=False)) - modified_existing_findings[match.id] = match break except Exception as e: deduplicationLogger.debug(str(e)) - _flush_duplicate_changes(modified_new_findings, modified_existing_findings) + _flush_duplicate_changes(modified_new_findings) def _dedupe_batch_legacy(findings): @@ -787,17 +779,15 @@ def _dedupe_batch_legacy(findings): if not (candidates_by_title or candidates_by_cwe): return modified_new_findings = [] - modified_existing_findings = {} for new_finding in findings: deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_LEGACY") for match in get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe): try: modified_new_findings.extend(set_duplicate(new_finding, match, save=False)) - modified_existing_findings[match.id] = match break except Exception as e: deduplicationLogger.debug(str(e)) - _flush_duplicate_changes(modified_new_findings, modified_existing_findings) + _flush_duplicate_changes(modified_new_findings) def dedupe_batch_of_findings(findings, *args, **kwargs): diff --git a/dojo/models.py b/dojo/models.py index aaee7042fe3..430de3d132e 100644 --- a/dojo/models.py +++ b/dojo/models.py @@ -2415,6 +2415,21 @@ class Finding(BaseModel): "line", # legacy ] + # Large text fields deferred in build_candidate_scope_queryset. These are + # never accessed during deduplication or reimport candidate matching, so + # excluding them reduces the data loaded for every candidate finding. + DEDUPLICATION_DEFERRED_FIELDS = [ + "description", + "mitigation", + "impact", + "steps_to_reproduce", + "severity_justification", + "references", + "url", + "cvssv3", + "cvssv4", + ] + title = models.CharField(max_length=511, verbose_name=_("Title"), help_text=_("A short description of the flaw.")) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index d7db6b5e688..fbc5eb83579 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -459,6 +459,6 @@ def test_deduplication_performance_pghistory_no_async(self): self._deduplication_performance( expected_num_queries1=271, expected_num_async_tasks1=7, - expected_num_queries2=213, + expected_num_queries2=183, expected_num_async_tasks2=7, )