Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 72 additions & 10 deletions dojo/finding/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def get_finding_models_for_deduplication(finding_ids):

return list(
Finding.objects.filter(id__in=finding_ids)
.only(*Finding.DEDUPLICATION_FIELDS)
.select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
.prefetch_related(
"endpoints",
Expand Down Expand Up @@ -112,7 +113,37 @@ def deduplicate_uid_or_hash_code(new_finding):
_dedupe_batch_uid_or_hash([new_finding])


def set_duplicate(new_finding, existing_finding):
def set_duplicate(new_finding, existing_finding, *, save=True):
"""
Mark new_finding as a duplicate of existing_finding.

Sets duplicate=True, active=False, verified=False, and duplicate_finding=existing_finding
on new_finding, then flattens any transitive duplicates: if any findings already point to
new_finding as their original, they are re-pointed directly to existing_finding (so the
duplicate chain never has more than one level of indirection).

The test_type of new_finding is added to existing_finding.found_by if not already present.

Args:
new_finding: The finding to mark as a duplicate.
existing_finding: The original finding that new_finding is a duplicate of.
Must not itself be a duplicate.
save: When True (default), each modified finding and existing_finding are
saved to the database immediately via super().save(skip_validation=True).
Pass save=False in batch contexts to defer persistence; the caller is
then responsible for bulk-saving the returned list and existing_finding.

Returns:
A list of all Finding instances whose fields were modified by this call, including
new_finding itself and any transitively re-pointed findings. The caller must persist
these when save=False.

Raises:
Exception: if existing_finding is itself a duplicate, if new_finding == existing_finding,
if marking would reopen a mitigated finding via a duplicate chain, or if
new_finding is already a duplicate and existing_finding is mitigated.

"""
deduplicationLogger.debug(f"new_finding.status(): {new_finding.id} {new_finding.status()}")
deduplicationLogger.debug(f"existing_finding.status(): {existing_finding.id} {existing_finding.status()}")
if existing_finding.duplicate:
Expand All @@ -135,6 +166,8 @@ def set_duplicate(new_finding, existing_finding):
new_finding.verified = False
new_finding.duplicate_finding = existing_finding

all_modified = [new_finding]

# Make sure transitive duplication is flattened
# if A -> B and B is made a duplicate of C here, afterwards:
# A -> C and B -> C should be true
Expand All @@ -143,7 +176,7 @@ def set_duplicate(new_finding, existing_finding):
# order_by here to prevent bypassing the prefetch cache.
for find in new_finding.original_finding.all():
new_finding.original_finding.remove(find)
set_duplicate(find, existing_finding)
all_modified.extend(set_duplicate(find, existing_finding, save=save))
# Only add test type to found_by if it is not already present.
# This is efficient because `found_by` is prefetched for candidates via `build_dedupe_scope_queryset()`.
test_type = getattr(getattr(new_finding, "test", None), "test_type", None)
Expand All @@ -152,10 +185,14 @@ def set_duplicate(new_finding, existing_finding):

# existing_finding.found_by.add(new_finding.test.test_type)

logger.debug("saving new finding: %d", new_finding.id)
super(Finding, new_finding).save(skip_validation=True)
logger.debug("saving existing finding: %d", existing_finding.id)
super(Finding, existing_finding).save(skip_validation=True)
if save:
for f in all_modified:
logger.debug("saving new finding: %d", f.id)
super(Finding, f).save(skip_validation=True)
logger.debug("saving existing finding: %d", existing_finding.id)
super(Finding, existing_finding).save(skip_validation=True)

return all_modified


def is_duplicate_reopen(new_finding, existing_finding) -> bool:
Expand Down Expand Up @@ -311,6 +348,7 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None):

return (
queryset
.defer(*Finding.DEDUPLICATION_DEFERRED_FIELDS)
.select_related("test", "test__engagement", "test__test_type")
.prefetch_related(*prefetch_list)
)
Expand Down Expand Up @@ -654,21 +692,39 @@ def get_matches_from_legacy_candidates(new_finding, candidates_by_title, candida
yield candidate


def _flush_duplicate_changes(modified_new_findings):
    """
    Persist the duplicate-related field changes gathered during a batch deduplication run.

    Issues a single bulk_update round-trip covering every modified finding rather
    than one save() call per finding. bulk_update does not fire signals, which
    matches the behaviour of the previous per-finding
    super(Finding, ...).save(skip_validation=True) calls — both paths bypass
    Finding.save().
    """
    if not modified_new_findings:
        return
    dirty_fields = ["duplicate", "active", "verified", "duplicate_finding"]
    Finding.objects.bulk_update(modified_new_findings, dirty_fields)


def _dedupe_batch_hash_code(findings):
if not findings:
return
test = findings[0].test
candidates_by_hash = find_candidates_for_deduplication_hash(test, findings)
if not candidates_by_hash:
return
modified_new_findings = []
for new_finding in findings:
deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_HASH_CODE")
for match in get_matches_from_hash_candidates(new_finding, candidates_by_hash):
try:
set_duplicate(new_finding, match)
modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
break
except Exception as e:
deduplicationLogger.debug(str(e))
_flush_duplicate_changes(modified_new_findings)


def _dedupe_batch_unique_id(findings):
Expand All @@ -678,16 +734,18 @@ def _dedupe_batch_unique_id(findings):
candidates_by_uid = find_candidates_for_deduplication_unique_id(test, findings)
if not candidates_by_uid:
return
modified_new_findings = []
for new_finding in findings:
deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL")
for match in get_matches_from_unique_id_candidates(new_finding, candidates_by_uid):
deduplicationLogger.debug(f"Trying to deduplicate finding {new_finding.id} against candidate {match.id}")
try:
set_duplicate(new_finding, match)
modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
deduplicationLogger.debug(f"Successfully deduplicated finding {new_finding.id} against candidate {match.id}")
break
except Exception as e:
deduplicationLogger.debug(f"Exception when deduplicating finding {new_finding.id} against candidate {match.id}: {e!s}")
_flush_duplicate_changes(modified_new_findings)


def _dedupe_batch_uid_or_hash(findings):
Expand All @@ -698,17 +756,19 @@ def _dedupe_batch_uid_or_hash(findings):
candidates_by_uid, existing_by_hash = find_candidates_for_deduplication_uid_or_hash(test, findings)
if not (candidates_by_uid or existing_by_hash):
return
modified_new_findings = []
for new_finding in findings:
deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE")
if new_finding.duplicate:
continue

for match in get_matches_from_uid_or_hash_candidates(new_finding, candidates_by_uid, existing_by_hash):
try:
set_duplicate(new_finding, match)
modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
break
except Exception as e:
deduplicationLogger.debug(str(e))
_flush_duplicate_changes(modified_new_findings)


def _dedupe_batch_legacy(findings):
Expand All @@ -718,14 +778,16 @@ def _dedupe_batch_legacy(findings):
candidates_by_title, candidates_by_cwe = find_candidates_for_deduplication_legacy(test, findings)
if not (candidates_by_title or candidates_by_cwe):
return
modified_new_findings = []
for new_finding in findings:
deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_LEGACY")
for match in get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe):
try:
set_duplicate(new_finding, match)
modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
break
except Exception as e:
deduplicationLogger.debug(str(e))
_flush_duplicate_changes(modified_new_findings)


def dedupe_batch_of_findings(findings, *args, **kwargs):
Expand Down
49 changes: 49 additions & 0 deletions dojo/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2381,6 +2381,55 @@ def __str__(self):


class Finding(BaseModel):
# Fields loaded when performing deduplication (used by get_finding_models_for_deduplication
# and build_candidate_scope_queryset to restrict the SELECT to only what is needed).
# Covers the union of all deduplication algorithms so that a single queryset works
# regardless of which algorithm is in use. Large text fields (description, mitigation,
# impact, references, …) are intentionally excluded.
DEDUPLICATION_FIELDS = [
"id",
# FK required for select_related("test") — must not be deferred
"test",
# Fields written by set_duplicate
"duplicate",
"active",
"verified",
"duplicate_finding",
# Guard checks in set_duplicate
"is_mitigated",
"mitigated",
"out_of_scope",
"false_p",
# Accessed by status() (debug logging only)
"under_review",
"risk_accepted",
# Used by hash-code and legacy algorithms for endpoint/location matching
"dynamic_finding",
"static_finding",
# Algorithm-specific matching fields
"hash_code", # hash_code, uid_or_hash, legacy
"unique_id_from_tool", # unique_id, uid_or_hash
"title", # legacy
"cwe", # legacy
"file_path", # legacy
"line", # legacy
]

# Large text fields deferred in build_candidate_scope_queryset. These are
# never accessed during deduplication or reimport candidate matching, so
# excluding them reduces the data loaded for every candidate finding.
DEDUPLICATION_DEFERRED_FIELDS = [
"description",
"mitigation",
"impact",
"steps_to_reproduce",
"severity_justification",
"references",
"url",
"cvssv3",
"cvssv4",
]

title = models.CharField(max_length=511,
verbose_name=_("Title"),
help_text=_("A short description of the flaw."))
Expand Down
8 changes: 4 additions & 4 deletions unittests/test_importers_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
@override_settings(ENABLE_AUDITLOG=True)
def test_import_reimport_reimport_performance_pghistory_async(self):
"""
This test checks the performance of the importers when using django-pghistory with async enabled.
This test checks the performance of the importers when using django-pghistory and celery tasks in sync mode.
Query counts will need to be determined by running the test initially.
"""
configure_audit_system()
Expand All @@ -279,7 +279,7 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
@override_settings(ENABLE_AUDITLOG=True)
def test_import_reimport_reimport_performance_pghistory_no_async(self):
"""
This test checks the performance of the importers when using django-pghistory with async disabled.
This test checks the performance of the importers when using django-pghistory and celery tasks in sync mode.
Query counts will need to be determined by running the test initially.
"""
configure_audit_system()
Expand Down Expand Up @@ -445,7 +445,7 @@ def test_deduplication_performance_pghistory_async(self):

@override_settings(ENABLE_AUDITLOG=True)
def test_deduplication_performance_pghistory_no_async(self):
"""Test deduplication performance with django-pghistory and async tasks disabled."""
"""Test deduplication performance with django-pghistory and celery tasks in sync mode."""
configure_audit_system()
configure_pghistory_triggers()

Expand All @@ -459,6 +459,6 @@ def test_deduplication_performance_pghistory_no_async(self):
self._deduplication_performance(
expected_num_queries1=271,
expected_num_async_tasks1=7,
expected_num_queries2=236,
expected_num_queries2=183,
expected_num_async_tasks2=7,
)