From ca2c0623bb14ea95d2c47ccf2597e59385f49c49 Mon Sep 17 00:00:00 2001
From: Valentijn Scholten <valentijnscholten@gmail.com>
Date: Fri, 6 Mar 2026 18:43:58 +0100
Subject: [PATCH 1/3] perf: batch duplicate marking in batch deduplication

Instead of saving each duplicate finding individually, collect all
modified findings during a batch deduplication run and flush them in
a single bulk_update call. Original (existing) findings are still
saved individually to preserve auto_now timestamp updates and
post_save signal behavior, but are deduplicated by id so each is
saved at most once per batch.

Reduces DB writes from O(2N) individual saves to 1 bulk_update +
O(unique originals) saves for a batch of N duplicates.

Performance test shows -23 queries on a second import with duplicates.
---
 dojo/finding/deduplication.py           | 61 +++++++++++++++++++++----
 unittests/test_importers_performance.py |  8 ++--
 2 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py
index c55d473a96c..63cb5c1c441 100644
--- a/dojo/finding/deduplication.py
+++ b/dojo/finding/deduplication.py
@@ -112,7 +112,7 @@ def deduplicate_uid_or_hash_code(new_finding):
     _dedupe_batch_uid_or_hash([new_finding])
 
 
-def set_duplicate(new_finding, existing_finding):
+def set_duplicate(new_finding, existing_finding, *, save=True):
     deduplicationLogger.debug(f"new_finding.status(): {new_finding.id} {new_finding.status()}")
     deduplicationLogger.debug(f"existing_finding.status(): {existing_finding.id} {existing_finding.status()}")
     if existing_finding.duplicate:
@@ -135,6 +135,8 @@ def set_duplicate(new_finding, existing_finding):
     new_finding.verified = False
     new_finding.duplicate_finding = existing_finding
 
+    all_modified = [new_finding]
+
     # Make sure transitive duplication is flattened
     # if A -> B and B is made a duplicate of C here, afterwards:
     # A -> C and B -> C should be true
@@ -143,7 +145,7 @@ def set_duplicate(new_finding, existing_finding):
     # order_by here to prevent bypassing the prefetch cache.
     for find in new_finding.original_finding.all():
         new_finding.original_finding.remove(find)
-        set_duplicate(find, existing_finding)
+        all_modified.extend(set_duplicate(find, existing_finding, save=save))
     # Only add test type to found_by if it is not already present.
     # This is efficient because `found_by` is prefetched for candidates via `build_dedupe_scope_queryset()`.
     test_type = getattr(getattr(new_finding, "test", None), "test_type", None)
@@ -152,10 +154,14 @@ def set_duplicate(new_finding, existing_finding):
 
     # existing_finding.found_by.add(new_finding.test.test_type)
 
-    logger.debug("saving new finding: %d", new_finding.id)
-    super(Finding, new_finding).save(skip_validation=True)
-    logger.debug("saving existing finding: %d", existing_finding.id)
-    super(Finding, existing_finding).save(skip_validation=True)
+    if save:
+        for f in all_modified:
+            logger.debug("saving new finding: %d", f.id)
+            super(Finding, f).save(skip_validation=True)
+        logger.debug("saving existing finding: %d", existing_finding.id)
+        super(Finding, existing_finding).save(skip_validation=True)
+
+    return all_modified
 
 
 def is_duplicate_reopen(new_finding, existing_finding) -> bool:
@@ -654,6 +660,25 @@ def get_matches_from_legacy_candidates(new_finding, candidates_by_title, candida
                 yield candidate
 
 
+def _flush_duplicate_changes(modified_new_findings, modified_existing_findings):
+    """
+    Persist duplicate field changes collected during a batch deduplication run.
+
+    new_findings are bulk-updated in one round-trip (no signals needed for a
+    finding being deactivated as a duplicate).  Each unique original finding is
+    saved individually so that its `updated` timestamp is bumped and the normal
+    pre/post-save hooks and Django signals fire (JIRA sync, notifications, etc.).
+    """
+    if modified_new_findings:
+        Finding.objects.bulk_update(
+            modified_new_findings,
+            ["duplicate", "active", "verified", "duplicate_finding"],
+        )
+    for existing_finding in modified_existing_findings.values():
+        logger.debug("saving existing finding: %d", existing_finding.id)
+        super(Finding, existing_finding).save(skip_validation=True)
+
+
 def _dedupe_batch_hash_code(findings):
     if not findings:
         return
@@ -661,14 +686,18 @@ def _dedupe_batch_hash_code(findings):
     candidates_by_hash = find_candidates_for_deduplication_hash(test, findings)
     if not candidates_by_hash:
         return
+    modified_new_findings = []
+    modified_existing_findings = {}
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_HASH_CODE")
         for match in get_matches_from_hash_candidates(new_finding, candidates_by_hash):
             try:
-                set_duplicate(new_finding, match)
+                modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
+                modified_existing_findings[match.id] = match
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
+    _flush_duplicate_changes(modified_new_findings, modified_existing_findings)
 
 
 def _dedupe_batch_unique_id(findings):
@@ -678,16 +707,20 @@ def _dedupe_batch_unique_id(findings):
     candidates_by_uid = find_candidates_for_deduplication_unique_id(test, findings)
     if not candidates_by_uid:
         return
+    modified_new_findings = []
+    modified_existing_findings = {}
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL")
         for match in get_matches_from_unique_id_candidates(new_finding, candidates_by_uid):
             deduplicationLogger.debug(f"Trying to deduplicate finding {new_finding.id} against candidate {match.id}")
             try:
-                set_duplicate(new_finding, match)
+                modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
+                modified_existing_findings[match.id] = match
                 deduplicationLogger.debug(f"Successfully deduplicated finding {new_finding.id} against candidate {match.id}")
                 break
             except Exception as e:
                 deduplicationLogger.debug(f"Exception when deduplicating finding {new_finding.id} against candidate {match.id}: {e!s}")
+    _flush_duplicate_changes(modified_new_findings, modified_existing_findings)
 
 
 def _dedupe_batch_uid_or_hash(findings):
@@ -698,6 +731,8 @@ def _dedupe_batch_uid_or_hash(findings):
     candidates_by_uid, existing_by_hash = find_candidates_for_deduplication_uid_or_hash(test, findings)
     if not (candidates_by_uid or existing_by_hash):
         return
+    modified_new_findings = []
+    modified_existing_findings = {}
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE")
         if new_finding.duplicate:
@@ -705,10 +740,12 @@ def _dedupe_batch_uid_or_hash(findings):
 
         for match in get_matches_from_uid_or_hash_candidates(new_finding, candidates_by_uid, existing_by_hash):
             try:
-                set_duplicate(new_finding, match)
+                modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
+                modified_existing_findings[match.id] = match
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
+    _flush_duplicate_changes(modified_new_findings, modified_existing_findings)
 
 
 def _dedupe_batch_legacy(findings):
@@ -718,14 +755,18 @@ def _dedupe_batch_legacy(findings):
     candidates_by_title, candidates_by_cwe = find_candidates_for_deduplication_legacy(test, findings)
     if not (candidates_by_title or candidates_by_cwe):
         return
+    modified_new_findings = []
+    modified_existing_findings = {}
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_LEGACY")
         for match in get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe):
             try:
-                set_duplicate(new_finding, match)
+                modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
+                modified_existing_findings[match.id] = match
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
+    _flush_duplicate_changes(modified_new_findings, modified_existing_findings)
 
 
 def dedupe_batch_of_findings(findings, *args, **kwargs):
diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py
index 565e04f98e6..d7db6b5e688 100644
--- a/unittests/test_importers_performance.py
+++ b/unittests/test_importers_performance.py
@@ -261,7 +261,7 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
     @override_settings(ENABLE_AUDITLOG=True)
     def test_import_reimport_reimport_performance_pghistory_async(self):
         """
-        This test checks the performance of the importers when using django-pghistory with async enabled.
+        This test checks the performance of the importers when using django-pghistory and celery tasks in sync mode
         Query counts will need to be determined by running the test initially.
         """
         configure_audit_system()
@@ -279,7 +279,7 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
     @override_settings(ENABLE_AUDITLOG=True)
     def test_import_reimport_reimport_performance_pghistory_no_async(self):
         """
-        This test checks the performance of the importers when using django-pghistory with async disabled.
+        This test checks the performance of the importers when using django-pghistory and celery tasks in sync mode.
         Query counts will need to be determined by running the test initially.
         """
         configure_audit_system()
@@ -445,7 +445,7 @@ def test_deduplication_performance_pghistory_async(self):
 
     @override_settings(ENABLE_AUDITLOG=True)
     def test_deduplication_performance_pghistory_no_async(self):
-        """Test deduplication performance with django-pghistory and async tasks disabled."""
+        """Test deduplication performance with django-pghistory and celery tasks in sync mode."""
         configure_audit_system()
         configure_pghistory_triggers()
 
@@ -459,6 +459,6 @@ def test_deduplication_performance_pghistory_no_async(self):
         self._deduplication_performance(
             expected_num_queries1=271,
             expected_num_async_tasks1=7,
-            expected_num_queries2=236,
+            expected_num_queries2=213,
             expected_num_async_tasks2=7,
         )

From bf6c28ee178be38e0473a84fcc0ea3ad95100f6e Mon Sep 17 00:00:00 2001
From: Valentijn Scholten <valentijnscholten@gmail.com>
Date: Fri, 6 Mar 2026 19:20:52 +0100
Subject: [PATCH 2/3] perf: restrict SELECT columns for batch deduplication via
 only()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add Finding.DEDUPLICATION_FIELDS — the union of all Finding fields
needed across every deduplication algorithm — and apply it as an
only() clause in get_finding_models_for_deduplication.

This avoids loading large text columns (description, mitigation,
impact, references, steps_to_reproduce, severity_justification, etc.)
when loading findings for the batch deduplication task, reducing
data transferred from the database without affecting query count.

build_candidate_scope_queryset is intentionally excluded: it is also
used for reimport matching (which accesses severity, numerical_severity
and other fields outside this set) and applying only() there would
cause deferred-field extra queries.
---
 dojo/finding/deduplication.py | 31 +++++++++++++++++++++++++++++++
 dojo/models.py                | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py
index 63cb5c1c441..bbb82f07b1d 100644
--- a/dojo/finding/deduplication.py
+++ b/dojo/finding/deduplication.py
@@ -32,6 +32,7 @@ def get_finding_models_for_deduplication(finding_ids):
 
     return list(
         Finding.objects.filter(id__in=finding_ids)
+        .only(*Finding.DEDUPLICATION_FIELDS)
         .select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
         .prefetch_related(
             "endpoints",
@@ -113,6 +114,36 @@ def deduplicate_uid_or_hash_code(new_finding):
 
 
 def set_duplicate(new_finding, existing_finding, *, save=True):
+    """
+    Mark new_finding as a duplicate of existing_finding.
+
+    Sets duplicate=True, active=False, verified=False, and duplicate_finding=existing_finding
+    on new_finding, then flattens any transitive duplicates: if any findings already point to
+    new_finding as their original, they are re-pointed directly to existing_finding (so the
+    duplicate chain never has more than one level of indirection).
+
+    The test_type of new_finding is added to existing_finding.found_by if not already present.
+
+    Args:
+        new_finding:      The finding to mark as a duplicate.
+        existing_finding: The original finding that new_finding is a duplicate of.
+                          Must not itself be a duplicate.
+        save:             When True (default), each modified finding and existing_finding are
+                          saved to the database immediately via super().save(skip_validation=True).
+                          Pass save=False in batch contexts to defer persistence; the caller is
+                          then responsible for bulk-saving the returned list and existing_finding.
+
+    Returns:
+        A list of all Finding instances whose fields were modified by this call, including
+        new_finding itself and any transitively re-pointed findings.  The caller must persist
+        these when save=False.
+
+    Raises:
+        Exception: if existing_finding is itself a duplicate, if new_finding == existing_finding,
+                   if marking would reopen a mitigated finding via a duplicate chain, or if
+                   new_finding is already a duplicate and existing_finding is mitigated.
+
+    """
     deduplicationLogger.debug(f"new_finding.status(): {new_finding.id} {new_finding.status()}")
     deduplicationLogger.debug(f"existing_finding.status(): {existing_finding.id} {existing_finding.status()}")
     if existing_finding.duplicate:
diff --git a/dojo/models.py b/dojo/models.py
index b5a58140045..aaee7042fe3 100644
--- a/dojo/models.py
+++ b/dojo/models.py
@@ -2381,6 +2381,40 @@ def __str__(self):
 
 
 class Finding(BaseModel):
+    # Fields loaded when performing deduplication (used by get_finding_models_for_deduplication
+    # and build_candidate_scope_queryset to restrict the SELECT to only what is needed).
+    # Covers the union of all deduplication algorithms so that a single queryset works
+    # regardless of which algorithm is in use.  Large text fields (description, mitigation,
+    # impact, references, …) are intentionally excluded.
+    DEDUPLICATION_FIELDS = [
+        "id",
+        # FK required for select_related("test") — must not be deferred
+        "test",
+        # Fields written by set_duplicate
+        "duplicate",
+        "active",
+        "verified",
+        "duplicate_finding",
+        # Guard checks in set_duplicate
+        "is_mitigated",
+        "mitigated",
+        "out_of_scope",
+        "false_p",
+        # Accessed by status() (debug logging only)
+        "under_review",
+        "risk_accepted",
+        # Used by hash-code and legacy algorithms for endpoint/location matching
+        "dynamic_finding",
+        "static_finding",
+        # Algorithm-specific matching fields
+        "hash_code",            # hash_code, uid_or_hash, legacy
+        "unique_id_from_tool",  # unique_id, uid_or_hash
+        "title",                # legacy
+        "cwe",                  # legacy
+        "file_path",            # legacy
+        "line",                 # legacy
+    ]
+
     title = models.CharField(max_length=511,
                              verbose_name=_("Title"),
                              help_text=_("A short description of the flaw."))

From f87f9b7784502f44ca019a748b853e4ec1085d31 Mon Sep 17 00:00:00 2001
From: Valentijn Scholten <valentijnscholten@gmail.com>
Date: Fri, 6 Mar 2026 19:48:28 +0100
Subject: [PATCH 3/3] perf(dedup): defer large text fields on candidate
 queryset

- Add Finding.DEDUPLICATION_DEFERRED_FIELDS constant listing large text
  columns (description, mitigation, impact, references, etc.) that are
  never read during deduplication or candidate matching.
- Apply .defer(*Finding.DEDUPLICATION_DEFERRED_FIELDS) in
  build_candidate_scope_queryset to avoid loading those columns for the
  potentially large candidate pool fetched per dedup batch.

Reduces deduplication second-import query count from 213 to 183 (-30).
---
 dojo/finding/deduplication.py           | 30 +++++++++----------------
 dojo/models.py                          | 15 +++++++++++++
 unittests/test_importers_performance.py |  2 +-
 3 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py
index bbb82f07b1d..1a3bddaf16e 100644
--- a/dojo/finding/deduplication.py
+++ b/dojo/finding/deduplication.py
@@ -348,6 +348,7 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None):
 
     return (
         queryset
+        .defer(*Finding.DEDUPLICATION_DEFERRED_FIELDS)
         .select_related("test", "test__engagement", "test__test_type")
         .prefetch_related(*prefetch_list)
     )
@@ -691,23 +692,20 @@ def get_matches_from_legacy_candidates(new_finding, candidates_by_title, candida
                 yield candidate
 
 
-def _flush_duplicate_changes(modified_new_findings, modified_existing_findings):
+def _flush_duplicate_changes(modified_new_findings):
     """
     Persist duplicate field changes collected during a batch deduplication run.
 
-    new_findings are bulk-updated in one round-trip (no signals needed for a
-    finding being deactivated as a duplicate).  Each unique original finding is
-    saved individually so that its `updated` timestamp is bumped and the normal
-    pre/post-save hooks and Django signals fire (JIRA sync, notifications, etc.).
+    Bulk-updates all modified new findings in one round-trip instead of one
+    save() call per finding.  Uses bulk_update (no signals) which is consistent
+    with the original code that called super(Finding, ...).save(skip_validation=True),
+    bypassing Finding.save() in both cases.
     """
     if modified_new_findings:
         Finding.objects.bulk_update(
             modified_new_findings,
             ["duplicate", "active", "verified", "duplicate_finding"],
         )
-    for existing_finding in modified_existing_findings.values():
-        logger.debug("saving existing finding: %d", existing_finding.id)
-        super(Finding, existing_finding).save(skip_validation=True)
 
 
 def _dedupe_batch_hash_code(findings):
@@ -718,17 +716,15 @@ def _dedupe_batch_hash_code(findings):
     if not candidates_by_hash:
         return
     modified_new_findings = []
-    modified_existing_findings = {}
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_HASH_CODE")
         for match in get_matches_from_hash_candidates(new_finding, candidates_by_hash):
             try:
                 modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
-                modified_existing_findings[match.id] = match
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
-    _flush_duplicate_changes(modified_new_findings, modified_existing_findings)
+    _flush_duplicate_changes(modified_new_findings)
 
 
 def _dedupe_batch_unique_id(findings):
@@ -739,19 +735,17 @@ def _dedupe_batch_unique_id(findings):
     if not candidates_by_uid:
         return
     modified_new_findings = []
-    modified_existing_findings = {}
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL")
         for match in get_matches_from_unique_id_candidates(new_finding, candidates_by_uid):
             deduplicationLogger.debug(f"Trying to deduplicate finding {new_finding.id} against candidate {match.id}")
             try:
                 modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
-                modified_existing_findings[match.id] = match
                 deduplicationLogger.debug(f"Successfully deduplicated finding {new_finding.id} against candidate {match.id}")
                 break
             except Exception as e:
                 deduplicationLogger.debug(f"Exception when deduplicating finding {new_finding.id} against candidate {match.id}: {e!s}")
-    _flush_duplicate_changes(modified_new_findings, modified_existing_findings)
+    _flush_duplicate_changes(modified_new_findings)
 
 
 def _dedupe_batch_uid_or_hash(findings):
@@ -763,7 +757,6 @@ def _dedupe_batch_uid_or_hash(findings):
     if not (candidates_by_uid or existing_by_hash):
         return
     modified_new_findings = []
-    modified_existing_findings = {}
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE")
         if new_finding.duplicate:
@@ -772,11 +765,10 @@ def _dedupe_batch_uid_or_hash(findings):
         for match in get_matches_from_uid_or_hash_candidates(new_finding, candidates_by_uid, existing_by_hash):
             try:
                 modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
-                modified_existing_findings[match.id] = match
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
-    _flush_duplicate_changes(modified_new_findings, modified_existing_findings)
+    _flush_duplicate_changes(modified_new_findings)
 
 
 def _dedupe_batch_legacy(findings):
@@ -787,17 +779,15 @@ def _dedupe_batch_legacy(findings):
     if not (candidates_by_title or candidates_by_cwe):
         return
     modified_new_findings = []
-    modified_existing_findings = {}
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_LEGACY")
         for match in get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe):
             try:
                 modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
-                modified_existing_findings[match.id] = match
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
-    _flush_duplicate_changes(modified_new_findings, modified_existing_findings)
+    _flush_duplicate_changes(modified_new_findings)
 
 
 def dedupe_batch_of_findings(findings, *args, **kwargs):
diff --git a/dojo/models.py b/dojo/models.py
index aaee7042fe3..430de3d132e 100644
--- a/dojo/models.py
+++ b/dojo/models.py
@@ -2415,6 +2415,21 @@ class Finding(BaseModel):
         "line",                 # legacy
     ]
 
+    # Large text fields deferred in build_candidate_scope_queryset.  These are
+    # never accessed during deduplication or reimport candidate matching, so
+    # excluding them reduces the data loaded for every candidate finding.
+    DEDUPLICATION_DEFERRED_FIELDS = [
+        "description",
+        "mitigation",
+        "impact",
+        "steps_to_reproduce",
+        "severity_justification",
+        "references",
+        "url",
+        "cvssv3",
+        "cvssv4",
+    ]
+
     title = models.CharField(max_length=511,
                              verbose_name=_("Title"),
                              help_text=_("A short description of the flaw."))
diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py
index d7db6b5e688..fbc5eb83579 100644
--- a/unittests/test_importers_performance.py
+++ b/unittests/test_importers_performance.py
@@ -459,6 +459,6 @@ def test_deduplication_performance_pghistory_no_async(self):
         self._deduplication_performance(
             expected_num_queries1=271,
             expected_num_async_tasks1=7,
-            expected_num_queries2=213,
+            expected_num_queries2=183,
             expected_num_async_tasks2=7,
         )