From a837d2de7f0f3e667157ba2270d6e89a00e8213d Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Jun 2026 09:53:25 +0200 Subject: [PATCH 01/29] feat(importers): add async-wait import execution mode Introduce a third import/reimport post-processing execution mode and make it configurable per request and via the user profile. - async (default): dispatch post-processing to the background, respond immediately - async_wait (new): dispatch to the background but wait for deduplication to finish before responding, so scan_added notifications and the returned statistics reflect the deduplicated state - sync: run post-processing inline in the web process Replace the legacy UserContactInfo.block_execution boolean with an import_execution_mode CharField; a data migration maps block_execution=True to the 'sync' mode. wants_block_execution() now derives from the sync mode, preserving global foreground-execution semantics for all async tasks. Add a request field import_execution_mode (ImportScanSerializer/ ReImportScanSerializer) resolved against the profile (request override wins), and a deduplication_complete boolean in the import/reimport response. The post_process_findings_batch task now stores its result (ignore_result=False) so async_wait can join on it via AsyncResult.get(); the join is bounded by the new DD_IMPORT_ASYNC_WAIT_TIMEOUT setting. --- dojo/api_v2/serializers.py | 42 ++++- dojo/celery_dispatch.py | 7 +- ...9_usercontactinfo_import_execution_mode.py | 16 ++ ..._remove_usercontactinfo_block_execution.py | 28 ++++ dojo/decorators.py | 6 +- dojo/finding/helper.py | 8 +- dojo/fixtures/defect_dojo_sample_data.json | 3 - .../defect_dojo_sample_data_locations.json | 3 - dojo/fixtures/dojo_testdata.json | 3 - dojo/fixtures/dojo_testdata_locations.json | 3 - dojo/importers/base_importer.py | 75 +++++++++ dojo/importers/default_importer.py | 10 +- dojo/importers/default_reimporter.py | 10 +- dojo/importers/options.py | 23 +++ dojo/middleware.py | 4 +- dojo/models.py | 11 +- dojo/settings/settings.dist.py | 4 + dojo/templates/dojo/view_user.html | 8 +- dojo/templates_classic/dojo/view_user.html | 8 +- dojo/user/models.py | 59 ++++++- dojo/user/ui/forms.py | 2 +- tests/base_test_class.py | 26 ++-- unittests/test_async_delete.py | 8 +- unittests/test_cascade_delete.py | 2 +- unittests/test_celery_dispatch_force_async.py | 2 +- unittests/test_deduplication_logic.py | 2 +- unittests/test_duplication_loops.py | 2 +- .../test_false_positive_history_logic.py | 2 +- unittests/test_finding_helper.py | 2 +- unittests/test_finding_model.py | 2 +- unittests/test_import_execution_mode.py | 147 ++++++++++++++++++ unittests/test_import_reimport.py | 4 +- unittests/test_importers_deduplication.py | 2 +- unittests/test_importers_performance.py | 14 +- unittests/test_jira_config_engagement.py | 2 +- unittests/test_jira_config_engagement_epic.py | 2 +- unittests/test_jira_import_and_pushing_api.py | 2 +- unittests/test_notifications.py | 10 +- .../test_prepare_duplicates_for_delete.py | 2 +- unittests/test_product_grading.py | 2 +- unittests/test_reimport_prefetch.py | 2 +- unittests/test_tag_inheritance.py | 4 +- unittests/test_tags.py | 4 +- unittests/test_watson_async_search_index.py | 2 +- 44 files changed, 484 insertions(+), 96 deletions(-) create mode 100644 dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py create mode 100644 dojo/db_migrations/0270_remove_usercontactinfo_block_execution.py create mode 100644 unittests/test_import_execution_mode.py diff --git a/dojo/api_v2/serializers.py b/dojo/api_v2/serializers.py index 667dc9c4348..a1d804a72d0 100644 --- a/dojo/api_v2/serializers.py +++ b/dojo/api_v2/serializers.py @@ -25,11 +25,13 @@ from dojo.location.models import Location from dojo.models import ( IMPORT_ACTIONS, + IMPORT_EXECUTION_MODE_CHOICES, SEVERITIES, SEVERITY_CHOICES, STATS_FIELDS, App_Analysis, Development_Environment, + Dojo_User, DojoMeta, Endpoint, Engagement, @@ -431,6 +433,16 @@ class CommonImportScanSerializer(serializers.Serializer): allow_null=True, default=None, queryset=User.objects.all(), ) push_to_jira = serializers.BooleanField(default=False) + import_execution_mode = serializers.ChoiceField( + required=False, + allow_null=True, + choices=IMPORT_EXECUTION_MODE_CHOICES, + help_text="Override how import post-processing (deduplication, jira push, grading, ...) is executed for " + "this request. 'async' dispatches post-processing to the background and responds immediately (default). " + "'async_wait' dispatches to the background but waits for deduplication to finish before responding, so " + "notifications and the returned statistics reflect the deduplicated state. 'sync' runs everything inline. " + "If omitted, falls back to the user's profile setting (import_execution_mode).", + ) environment = serializers.CharField(required=False) build_id = serializers.CharField( required=False, help_text="ID of the build that was scanned.", @@ -476,6 +488,14 @@ class CommonImportScanSerializer(serializers.Serializer): help_text=_("Also referred to as 'Organization' ID."), ) statistics = ImportStatisticsSerializer(read_only=True, required=False) + deduplication_complete = serializers.BooleanField( + read_only=True, + required=False, + help_text="Whether deduplication had finished by the time this response was produced. " + "True for 'sync' and for 'async_wait' when deduplication completed within the timeout; " + "False for 'async' (deduplication is still running in the background) or when an " + "'async_wait' import timed out waiting for it.", + ) pro = serializers.ListField(read_only=True, required=False) apply_tags_to_findings = serializers.BooleanField( help_text="If set to True, the tags will be applied to the findings", @@ -534,6 +554,7 @@ def process_scan( data["product_id"] = test.engagement.product.id data["product_type_id"] = test.engagement.product.prod_type.id data["statistics"] = {"after": test.statistics} + data["deduplication_complete"] = importer.deduplication_complete duration = time.perf_counter() - start_time LargeScanSizeProductAnnouncement(response_data=data, duration=duration) ScanTypeProductAnnouncement(response_data=data, scan_type=context.get("scan_type")) @@ -632,6 +653,14 @@ def setup_common_context(self, data: dict) -> dict: if eng_end_date: context["target_end"] = context.get("engagement_end_date") + # Resolve the effective import execution mode: request override (if any) + # takes precedence over the user's profile setting, otherwise default async. + request = self.context.get("request") + user = getattr(request, "user", None) + context["import_execution_mode"] = Dojo_User.resolve_import_execution_mode( + user, data.get("import_execution_mode"), + ) + return context @@ -805,11 +834,11 @@ def process_scan( try: logger.debug(f"process_scan called with context: {context}") start_time = time.perf_counter() + processor = None if test := context.get("test"): statistics_before = test.statistics - context["test"], _, _, _, _, _, test_import = self.get_reimporter( - **context, - ).process_scan( + processor = self.get_reimporter(**context) + context["test"], _, _, _, _, _, test_import = processor.process_scan( context.pop("scan", None), ) if test_import: @@ -821,9 +850,10 @@ def process_scan( # Do not close old findings when creating a brand new test: there are no # existing findings to compare against, and close_old_findings would # incorrectly close findings from other tests in the same scope. - context["test"], _, _, _, _, _, _ = self.get_importer( + processor = self.get_importer( **{**context, "close_old_findings": False}, - ).process_scan( + ) + context["test"], _, _, _, _, _, _ = processor.process_scan( context.pop("scan", None), ) else: @@ -842,6 +872,8 @@ def process_scan( if statistics_delta: data["statistics"]["delta"] = statistics_delta data["statistics"]["after"] = test.statistics + if processor is not None: + data["deduplication_complete"] = processor.deduplication_complete duration = time.perf_counter() - start_time LargeScanSizeProductAnnouncement(response_data=data, duration=duration) ScanTypeProductAnnouncement(response_data=data, scan_type=context.get("scan_type")) diff --git a/dojo/celery_dispatch.py b/dojo/celery_dispatch.py index 38a3b4fa06c..bd552eab7ea 100644 --- a/dojo/celery_dispatch.py +++ b/dojo/celery_dispatch.py @@ -61,10 +61,11 @@ def dojo_dispatch_task(task_or_sig: _SupportsSi | _SupportsApplyAsync | Signatur - Inject `async_user_id` if missing. - Capture and inject pghistory context if available. - - Respect `force_sync=True` (foreground execution) and user `block_execution`. + - Respect `force_sync=True` (foreground execution) and the user's + synchronous import_execution_mode. - Respect `force_async=True` (background execution even when the caller - would otherwise run synchronously, e.g. user has `block_execution`). - `force_async` wins over `force_sync` and `block_execution`. + would otherwise run synchronously, e.g. user on the synchronous mode). + `force_async` wins over `force_sync` and the user's mode. - Support `countdown=` for async dispatch. Returns: diff --git a/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py b/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py new file mode 100644 index 00000000000..220f0240b1a --- /dev/null +++ b/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py @@ -0,0 +1,16 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('dojo', '0268_release_authorization_to_pro'), + ] + + operations = [ + migrations.AddField( + model_name='usercontactinfo', + name='import_execution_mode', + field=models.CharField(blank=True, choices=[('async', 'Async (do not wait)'), ('async_wait', 'Async, wait for deduplication'), ('sync', 'Synchronous (block)')], help_text="Controls how import/reimport post-processing is executed. 'Async' returns immediately (default). 'Async, wait for deduplication' runs post-processing in the background but waits for deduplication to finish before responding, so notifications and statistics are accurate. 'Synchronous' runs everything inline (and blocks all async tasks in the foreground for this user, like the old 'block execution' flag). Can be overridden per request.", max_length=20, null=True), + ), + ] diff --git a/dojo/db_migrations/0270_remove_usercontactinfo_block_execution.py b/dojo/db_migrations/0270_remove_usercontactinfo_block_execution.py new file mode 100644 index 00000000000..56bba7e2886 --- /dev/null +++ b/dojo/db_migrations/0270_remove_usercontactinfo_block_execution.py @@ -0,0 +1,28 @@ +from django.db import migrations + + +def block_execution_to_sync_mode(apps, schema_editor): + """Map the legacy block_execution=True flag to the synchronous import execution mode.""" + UserContactInfo = apps.get_model("dojo", "UserContactInfo") + UserContactInfo.objects.filter(block_execution=True).update(import_execution_mode="sync") + + +def sync_mode_to_block_execution(apps, schema_editor): + """Reverse: restore block_execution=True for users on the synchronous mode.""" + UserContactInfo = apps.get_model("dojo", "UserContactInfo") + UserContactInfo.objects.filter(import_execution_mode="sync").update(block_execution=True) + + +class Migration(migrations.Migration): + + dependencies = [ + ('dojo', '0269_usercontactinfo_import_execution_mode'), + ] + + operations = [ + migrations.RunPython(block_execution_to_sync_mode, sync_mode_to_block_execution), + migrations.RemoveField( + model_name='usercontactinfo', + name='block_execution', + ), + ] diff --git a/dojo/decorators.py b/dojo/decorators.py index cbe33de732d..7e533368f77 100644 --- a/dojo/decorators.py +++ b/dojo/decorators.py @@ -76,15 +76,15 @@ def we_want_async(*args, func=None, **kwargs): return True if Dojo_User.wants_block_execution(user): - logger.debug("dojo_async_task %s: running task in the foreground as block_execution is set to True for %s", func, user) + logger.debug("dojo_async_task %s: running task in the foreground as import_execution_mode is 'sync' for %s", func, user) return False - logger.debug("dojo_async_task %s: running task in the background as user has not set block_execution to True for %s", func, user) + logger.debug("dojo_async_task %s: running task in the background as import_execution_mode is not 'sync' for %s", func, user) return True # Defect Dojo performs all tasks asynchrnonously using celery -# *unless* the user initiating the task has set block_execution to True in their usercontactinfo profile +# *unless* the user initiating the task has set import_execution_mode to 'sync' in their usercontactinfo profile def dojo_async_task(func=None, *, signature=False): def decorator(func): @wraps(func) diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py index d83d032b176..a5ae551f367 100644 --- a/dojo/finding/helper.py +++ b/dojo/finding/helper.py @@ -456,7 +456,13 @@ def post_process_finding_save_internal(finding, dedupe_option=True, rules_option jira_services.push(finding.finding_group) -@app.task +# ignore_result=False so the 'async_wait' import execution mode can join on the +# dispatched batch via AsyncResult.get() even when CELERY_TASK_IGNORE_RESULT=True. +# NOTE: this override may not be strictly necessary — in the past .get()/await +# appears to have worked with the global CELERY_TASK_IGNORE_RESULT=True and a +# Redis broker. Needs verification against a real broker/worker setup; if join +# works without it, this override can be removed to avoid storing extra results. +@app.task(ignore_result=False) def post_process_findings_batch( finding_ids, *args, diff --git a/dojo/fixtures/defect_dojo_sample_data.json b/dojo/fixtures/defect_dojo_sample_data.json index f4a44ab5459..313f065093d 100644 --- a/dojo/fixtures/defect_dojo_sample_data.json +++ b/dojo/fixtures/defect_dojo_sample_data.json @@ -687,7 +687,6 @@ }, { "fields": { - "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, @@ -705,7 +704,6 @@ }, { "fields": { - "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, @@ -723,7 +721,6 @@ }, { "fields": { - "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, diff --git a/dojo/fixtures/defect_dojo_sample_data_locations.json b/dojo/fixtures/defect_dojo_sample_data_locations.json index f338fa62344..c6d1fa39037 100644 --- a/dojo/fixtures/defect_dojo_sample_data_locations.json +++ b/dojo/fixtures/defect_dojo_sample_data_locations.json @@ -687,7 +687,6 @@ }, { "fields": { - "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, @@ -707,7 +706,6 @@ }, { "fields": { - "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, @@ -727,7 +725,6 @@ }, { "fields": { - "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, diff --git a/dojo/fixtures/dojo_testdata.json b/dojo/fixtures/dojo_testdata.json index 50707a2d2bf..b1471612a4c 100644 --- a/dojo/fixtures/dojo_testdata.json +++ b/dojo/fixtures/dojo_testdata.json @@ -277,7 +277,6 @@ "title": null, "twitter_username": "#admin", "user": 1, - "block_execution": false, "github_username": null } }, @@ -291,7 +290,6 @@ "title": null, "twitter_username": null, "user": 2, - "block_execution": false, "github_username": null } }, @@ -305,7 +303,6 @@ "title": null, "twitter_username": null, "user": 3, - "block_execution": false, "github_username": null } }, diff --git a/dojo/fixtures/dojo_testdata_locations.json b/dojo/fixtures/dojo_testdata_locations.json index 3d4eb06ff9b..99e9e875953 100644 --- a/dojo/fixtures/dojo_testdata_locations.json +++ b/dojo/fixtures/dojo_testdata_locations.json @@ -277,7 +277,6 @@ "title": null, "twitter_username": "#admin", "user": 1, - "block_execution": false, "github_username": null } }, @@ -291,7 +290,6 @@ "title": null, "twitter_username": null, "user": 2, - "block_execution": false, "github_username": null } }, @@ -305,7 +303,6 @@ "title": null, "twitter_username": null, "user": 3, - "block_execution": false, "github_username": null } }, diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index fe015b610b0..b17887b4fb1 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -19,6 +19,8 @@ # Import History States IMPORT_CLOSED_FINDING, IMPORT_CREATED_FINDING, + IMPORT_EXECUTION_MODE_ASYNC_WAIT, + IMPORT_EXECUTION_MODE_SYNC, IMPORT_REACTIVATED_FINDING, IMPORT_UNTOUCHED_FINDING, # Finding Severities @@ -81,6 +83,79 @@ def __init__( self.pending_vulnerability_ids: list[Vulnerability_Id] = [] self.pending_vuln_id_deletes: list[int] = [] self.pending_burp_rr: list[BurpRawRequestResponse] = [] + # Handles for async post-processing tasks to await in 'async_wait' mode. + # Set after ImporterOptions.__init__ so it stays out of field_names + # (and the compress/decompress cycle used for async dispatch). + self.post_processing_results = [] + # Whether deduplication is known to be finished by the time the response + # is built. True for 'sync' (ran inline) and for 'async_wait' when all + # batches completed within the timeout; False for 'async' (dispatched, + # not awaited) or when an 'async_wait' join timed out/errored. + self.deduplication_complete = False + + def post_processing_dispatch_kwargs(self, **kwargs): + """ + Translate the resolved import execution mode into the force flags that + dojo_dispatch_task understands: + - SYNC: run inline in the web process (force_sync). + - ASYNC_WAIT: guarantee background dispatch (force_async) so we get a + handle to await, regardless of the user's profile mode. + - ASYNC (default): preserve historical behavior, honoring any externally + supplied force_sync and the user's sync mode via we_want_async. + """ + if self.import_execution_mode == IMPORT_EXECUTION_MODE_SYNC: + return {"force_sync": True} + if self.import_execution_mode == IMPORT_EXECUTION_MODE_ASYNC_WAIT: + return {"force_async": True} + return {"force_sync": kwargs.get("force_sync", False)} + + def record_post_processing_result(self, result): + """ + Remember an async post-processing dispatch handle so it can be awaited + later when running in the 'async_wait' execution mode. No-op for the + other modes (no handle is recorded by the caller). + """ + if not hasattr(self, "post_processing_results"): + self.post_processing_results = [] + if result is not None: + self.post_processing_results.append(result) + + def wait_for_post_processing(self): + """ + Block until the deduplication (and other batch) post-processing tasks + dispatched during this import have finished, so notifications and the + returned statistics reflect the deduplicated state. + + Only relevant in the 'async_wait' execution mode; bounded by + settings.IMPORT_ASYNC_WAIT_TIMEOUT so a stuck/missing worker degrades + to the historical (respond-anyway) behavior instead of hanging. + """ + if self.import_execution_mode == IMPORT_EXECUTION_MODE_SYNC: + # Batches ran inline during process_findings, so dedup is already done. + self.deduplication_complete = True + return + if self.import_execution_mode != IMPORT_EXECUTION_MODE_ASYNC_WAIT: + # 'async': post-processing was dispatched but is not awaited. + self.deduplication_complete = False + return + results = getattr(self, "post_processing_results", None) or [] + if not results: + # Nothing was dispatched (e.g. empty import) — dedup is trivially done. + self.deduplication_complete = True + return + timeout = getattr(settings, "IMPORT_ASYNC_WAIT_TIMEOUT", 120) + logger.debug("async_wait: waiting for %d post-processing task(s) (timeout=%ss)", len(results), timeout) + success = True + for result in results: + if result is None or not hasattr(result, "get"): + continue + try: + result.get(timeout=timeout, propagate=False) + except Exception as e: + logger.warning("async_wait: error/timeout while waiting for post-processing task: %s", e) + success = False + self.deduplication_complete = success + self.post_processing_results = [] def check_child_implementation_exception(self): """ diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 6cdcb699024..cd7b65296cb 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -12,6 +12,7 @@ from dojo.importers.options import ImporterOptions from dojo.jira import services as jira_services from dojo.models import ( + IMPORT_EXECUTION_MODE_ASYNC_WAIT, Engagement, Finding, Test, @@ -146,6 +147,9 @@ def process_scan( url=reverse("view_test", args=(self.test.id,)), url_api=reverse("test-detail", args=(self.test.id,)), ) + # In 'async_wait' mode, block until background deduplication has finished + # so notifications and statistics reflect the deduplicated state. + self.wait_for_post_processing() updated_count = len(new_findings) + len(closed_findings) self.notify_scan_added( self.test, @@ -292,7 +296,7 @@ def _process_findings_internal( batch_finding_ids.clear() logger.debug("process_findings: dispatching batch with push_to_jira=%s (batch_size=%d, is_final=%s)", push_to_jira, len(finding_ids_batch), is_final_finding) - dojo_dispatch_task( + result = dojo_dispatch_task( finding_helper.post_process_findings_batch, finding_ids_batch, dedupe_option=True, @@ -300,8 +304,10 @@ def _process_findings_internal( product_grading_option=True, issue_updater_option=True, push_to_jira=push_to_jira, - force_sync=kwargs.get("force_sync", False), + **self.post_processing_dispatch_kwargs(**kwargs), ) + if self.import_execution_mode == IMPORT_EXECUTION_MODE_ASYNC_WAIT: + self.record_post_processing_result(result) # No chord: tasks are dispatched immediately above per batch diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 9defa8352f2..346f6ccb23e 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -18,6 +18,7 @@ from dojo.importers.options import ImporterOptions from dojo.jira import services as jira_services from dojo.models import ( + IMPORT_EXECUTION_MODE_ASYNC_WAIT, Development_Environment, Finding, Notes, @@ -142,6 +143,9 @@ def process_scan( ) # Send out som notifications to the user logger.debug("REIMPORT_SCAN: Generating notifications") + # In 'async_wait' mode, block until background deduplication has finished + # so notifications and statistics reflect the deduplicated state. + self.wait_for_post_processing() updated_count = ( len(closed_findings) + len(reactivated_findings) + len(new_findings) ) @@ -458,7 +462,7 @@ def _process_findings_internal( batch_findings.clear() finding_ids_batch = list(batch_finding_ids) batch_finding_ids.clear() - dojo_dispatch_task( + result = dojo_dispatch_task( finding_helper.post_process_findings_batch, finding_ids_batch, dedupe_option=True, @@ -467,8 +471,10 @@ def _process_findings_internal( issue_updater_option=True, push_to_jira=push_to_jira, jira_instance_id=getattr(self.jira_instance, "id", None), - force_sync=kwargs.get("force_sync", False), + **self.post_processing_dispatch_kwargs(**kwargs), ) + if self.import_execution_mode == IMPORT_EXECUTION_MODE_ASYNC_WAIT: + self.record_post_processing_result(result) # No chord: tasks are dispatched immediately above per batch diff --git a/dojo/importers/options.py b/dojo/importers/options.py index 02cecff1113..3817a96fe08 100644 --- a/dojo/importers/options.py +++ b/dojo/importers/options.py @@ -12,6 +12,9 @@ from dojo.jira.services import get_instance as get_jira_instance from dojo.models import ( + IMPORT_EXECUTION_MODE_ASYNC, + IMPORT_EXECUTION_MODE_SYNC, + IMPORT_EXECUTION_MODES, Development_Environment, Dojo_User, Endpoint, @@ -68,6 +71,7 @@ def load_base_options( self.engagement: Engagement | None = self.validate_engagement(*args, **kwargs) self.environment: Development_Environment | None = self.validate_environment(*args, **kwargs) self.group_by: str = self.validate_group_by(*args, **kwargs) + self.import_execution_mode: str = self.validate_import_execution_mode(*args, **kwargs) self.import_type: str = self.validate_import_type(*args, **kwargs) self.lead: Dojo_User | None = self.validate_lead(*args, **kwargs) self.minimum_severity: str = self.validate_minimum_severity(*args, **kwargs) @@ -345,6 +349,25 @@ def validate_do_not_reactivate( **kwargs, ) + def validate_import_execution_mode( + self, + *args: list, + **kwargs: dict, + ) -> str: + mode = self.validate( + "import_execution_mode", + expected_types=[str], + required=False, + default=IMPORT_EXECUTION_MODE_ASYNC, + **kwargs, + ) + if mode not in IMPORT_EXECUTION_MODES: + mode = IMPORT_EXECUTION_MODE_ASYNC + # An explicit force_sync from a non-serializer caller still wins. + if kwargs.get("force_sync"): + mode = IMPORT_EXECUTION_MODE_SYNC + return mode + def validate_commit_hash( self, *args: list, diff --git a/dojo/middleware.py b/dojo/middleware.py index a576244312a..73e0ed601dd 100644 --- a/dojo/middleware.py +++ b/dojo/middleware.py @@ -281,8 +281,8 @@ def _drain_search_context_to_async(objects, source): for model_name, pk_list in model_groups.items(): batches = [pk_list[i:i + batch_size] for i in range(0, len(pk_list), batch_size)] # force_async=True keeps indexing off the request path even for users - # with block_execution=True — index updates are slow and never need - # to be synchronous from the user's perspective. + # on the synchronous import_execution_mode — index updates are slow and + # never need to be synchronous from the user's perspective. for i, batch in enumerate(batches, 1): logger.debug(f"{source}: Triggering batch {i}/{len(batches)} for {model_name}: {len(batch)} instances") dojo_dispatch_task(update_watson_search_index_for_model, model_name, batch, force_async=True) diff --git a/dojo/models.py b/dojo/models.py index c4d3d0aaa68..2e2b3f899d2 100644 --- a/dojo/models.py +++ b/dojo/models.py @@ -123,7 +123,16 @@ def __call__(self, model_instance, filename): from dojo.regulations.models import Regulation # noqa: E402, F401, I001 -- re-export; user/system_settings block intentionally out-of-order (load-order) -from dojo.user.models import Contact, Dojo_User, UserContactInfo # noqa: E402, F401 -- must precede system_settings (middleware load-order) +from dojo.user.models import ( # noqa: E402, F401 -- must precede system_settings (middleware load-order) + IMPORT_EXECUTION_MODE_ASYNC, + IMPORT_EXECUTION_MODE_ASYNC_WAIT, + IMPORT_EXECUTION_MODE_CHOICES, + IMPORT_EXECUTION_MODE_SYNC, + IMPORT_EXECUTION_MODES, + Contact, + Dojo_User, + UserContactInfo, +) from dojo.system_settings.models import System_Settings # noqa: E402, F401 -- re-export diff --git a/dojo/settings/settings.dist.py b/dojo/settings/settings.dist.py index 4c0f161768a..93361398f04 100644 --- a/dojo/settings/settings.dist.py +++ b/dojo/settings/settings.dist.py @@ -96,6 +96,9 @@ DD_CELERY_BROKER_PARAMS=(str, ""), DD_CELERY_BROKER_TRANSPORT_OPTIONS=(str, ""), DD_CELERY_TASK_IGNORE_RESULT=(bool, True), + # Max seconds the 'async_wait' import execution mode will wait for background + # deduplication/post-processing to finish before responding anyway. + DD_IMPORT_ASYNC_WAIT_TIMEOUT=(int, 120), DD_CELERY_RESULT_BACKEND=(str, "django-db"), DD_CELERY_RESULT_EXPIRES=(int, 86400), DD_CELERY_BEAT_SCHEDULE_FILENAME=(str, root("dojo.celery.beat.db")), @@ -864,6 +867,7 @@ def generate_url(scheme, double_slashes, user, password, host, port, path, param params=env("DD_CELERY_BROKER_PARAMS"), ) CELERY_TASK_IGNORE_RESULT = env("DD_CELERY_TASK_IGNORE_RESULT") +IMPORT_ASYNC_WAIT_TIMEOUT = env("DD_IMPORT_ASYNC_WAIT_TIMEOUT") CELERY_RESULT_BACKEND = env("DD_CELERY_RESULT_BACKEND") CELERY_TIMEZONE = TIME_ZONE CELERY_RESULT_EXPIRES = env("DD_CELERY_RESULT_EXPIRES") diff --git a/dojo/templates/dojo/view_user.html b/dojo/templates/dojo/view_user.html index 4f1d0a3b04d..47542577c0d 100644 --- a/dojo/templates/dojo/view_user.html +++ b/dojo/templates/dojo/view_user.html @@ -279,13 +279,9 @@

- {% trans "Block execution" %} + {% trans "Import execution mode" %} - {% if user.usercontactinfo.block_execution %} - - {% else %} - - {% endif %} + {{ user.usercontactinfo.get_import_execution_mode_display|default:_("Async (do not wait)") }} diff --git a/dojo/templates_classic/dojo/view_user.html b/dojo/templates_classic/dojo/view_user.html index e1fe9917722..4d6ac25220f 100644 --- a/dojo/templates_classic/dojo/view_user.html +++ b/dojo/templates_classic/dojo/view_user.html @@ -279,13 +279,9 @@

- {% trans "Block execution" %} + {% trans "Import execution mode" %} - {% if user.usercontactinfo.block_execution %} - - {% else %} - - {% endif %} + {{ user.usercontactinfo.get_import_execution_mode_display|default:_("Async (do not wait)") }} diff --git a/dojo/user/models.py b/dojo/user/models.py index ea18bbc840e..02e9f7fbbed 100644 --- a/dojo/user/models.py +++ b/dojo/user/models.py @@ -6,6 +6,28 @@ User = get_user_model() +# Import post-processing execution modes. +# - ASYNC: post-processing (dedup, jira, grading, ...) runs in the background; +# the API responds immediately (default, historical behavior). +# - ASYNC_WAIT: post-processing is dispatched to the background as usual, but the +# request waits for the deduplication batches to finish before responding, so +# notifications and the returned statistics reflect the deduplicated state. +# - SYNC: post-processing runs inline in the web process (legacy block_execution). +IMPORT_EXECUTION_MODE_ASYNC = "async" +IMPORT_EXECUTION_MODE_ASYNC_WAIT = "async_wait" +IMPORT_EXECUTION_MODE_SYNC = "sync" +IMPORT_EXECUTION_MODES = ( + IMPORT_EXECUTION_MODE_ASYNC, + IMPORT_EXECUTION_MODE_ASYNC_WAIT, + IMPORT_EXECUTION_MODE_SYNC, +) +IMPORT_EXECUTION_MODE_CHOICES = ( + (IMPORT_EXECUTION_MODE_ASYNC, _("Async (do not wait)")), + (IMPORT_EXECUTION_MODE_ASYNC_WAIT, _("Async, wait for deduplication")), + (IMPORT_EXECUTION_MODE_SYNC, _("Synchronous (block)")), +) + + # proxy class for convenience and UI class Dojo_User(User): class Meta: @@ -20,8 +42,25 @@ def __str__(self): @staticmethod def wants_block_execution(user): - # this return False if there is no user, i.e. in celery processes, unittests, etc. - return hasattr(user, "usercontactinfo") and user.usercontactinfo.block_execution + # this returns False if there is no user, i.e. in celery processes, unittests, etc. + # The synchronous import execution mode is the successor of the old block_execution + # flag and governs whether async tasks run in the foreground for this user. + return hasattr(user, "usercontactinfo") and user.usercontactinfo.import_execution_mode == IMPORT_EXECUTION_MODE_SYNC + + @staticmethod + def resolve_import_execution_mode(user, override=None): + """ + Resolve the effective import post-processing execution mode. + + Priority: explicit request override > user profile setting > default async. + Returns one of IMPORT_EXECUTION_MODE_ASYNC / _ASYNC_WAIT / _SYNC. + """ + if override in IMPORT_EXECUTION_MODES: + return override + info = getattr(user, "usercontactinfo", None) + if info is not None and info.import_execution_mode in IMPORT_EXECUTION_MODES: + return info.import_execution_mode + return IMPORT_EXECUTION_MODE_ASYNC @staticmethod def force_password_reset(user): @@ -62,7 +101,21 @@ class UserContactInfo(models.Model): github_username = models.CharField(blank=True, null=True, max_length=150) slack_username = models.CharField(blank=True, null=True, max_length=150, help_text=_("Email address associated with your slack account"), verbose_name=_("Slack Email Address")) slack_user_id = models.CharField(blank=True, null=True, max_length=25) - block_execution = models.BooleanField(default=False, help_text=_("Instead of async deduping a finding the findings will be deduped synchronously and will 'block' the user until completion.")) + import_execution_mode = models.CharField( + max_length=20, + choices=IMPORT_EXECUTION_MODE_CHOICES, + null=True, + blank=True, + help_text=_( + "Controls how import/reimport post-processing is executed. " + "'Async' returns immediately (default). 'Async, wait for deduplication' " + "runs post-processing in the background but waits for deduplication to " + "finish before responding, so notifications and statistics are accurate. " + "'Synchronous' runs everything inline (and blocks all async tasks in the " + "foreground for this user, like the old 'block execution' flag). Can be " + "overridden per request.", + ), + ) force_password_reset = models.BooleanField(default=False, help_text=_("Forces this user to reset their password on next login.")) ui_use_tailwind = models.BooleanField(default=False, verbose_name=_("Use new UI (beta)"), help_text=_("Opt in to the new Tailwind-based UI. Leave off for the classic UI.")) token_last_reset = models.DateTimeField(null=True, blank=True, help_text=_("Timestamp of the most recent API token reset for this user.")) diff --git a/dojo/user/ui/forms.py b/dojo/user/ui/forms.py index d32c3da187e..4024a4e51b7 100644 --- a/dojo/user/ui/forms.py +++ b/dojo/user/ui/forms.py @@ -80,7 +80,7 @@ class Meta: # Swap order: password_last_reset before token_last_reset field_order = [ "title", "phone_number", "cell_number", "twitter_username", "github_username", - "slack_username", "ui_use_tailwind", "block_execution", "force_password_reset", "reset_api_token", + "slack_username", "ui_use_tailwind", "import_execution_mode", "force_password_reset", "reset_api_token", "password_last_reset", "token_last_reset", ] diff --git a/tests/base_test_class.py b/tests/base_test_class.py index d9043a43fd5..2798d2f2a58 100644 --- a/tests/base_test_class.py +++ b/tests/base_test_class.py @@ -10,7 +10,7 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions -from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import Select, WebDriverWait # import time logging.basicConfig( @@ -329,22 +329,24 @@ def enable_github(self): return self.enable_system_setting("id_enable_github") def set_block_execution(self, *, block_execution=True): - # we set the admin user (ourselves) to have block_execution checked - # this will force dedupe to happen synchronously, among other things like notifications, rules, ... - logger.info("setting block execution to: %s", block_execution) + # we set the admin user (ourselves) to the synchronous import execution mode + # (the successor of the old block_execution flag) when block_execution=True. + # This forces dedupe to happen synchronously, among other things like + # notifications, rules, ... Otherwise we select the default async mode. + target_mode = "sync" if block_execution else "async" + logger.info("setting import execution mode to: %s", target_mode) driver = self.driver driver.get(self.base_url + "profile") - if ( - driver.find_element(By.ID, "id_block_execution").is_selected() - != block_execution - ): - driver.find_element(By.XPATH, '//*[@id="id_block_execution"]').click() + select = Select(driver.find_element(By.ID, "id_import_execution_mode")) + if select.first_selected_option.get_attribute("value") != target_mode: + select.select_by_value(target_mode) # save settings driver.find_element(By.CSS_SELECTOR, "input.btn.btn-primary").click() - # check if it's enabled after reload + # check if it's applied after reload + select = Select(driver.find_element(By.ID, "id_import_execution_mode")) self.assertEqual( - driver.find_element(By.ID, "id_block_execution").is_selected(), - block_execution, + select.first_selected_option.get_attribute("value"), + target_mode, ) return driver diff --git a/unittests/test_async_delete.py b/unittests/test_async_delete.py index 7aa8f0769a1..ae0105791e7 100644 --- a/unittests/test_async_delete.py +++ b/unittests/test_async_delete.py @@ -27,21 +27,21 @@ class TestAsyncDelete(DojoTestCase): """ Test async_delete functionality with dojo_dispatch_task kwargs injection. - These tests use block_execution=True and crum.impersonate to run tasks synchronously, + These tests use import_execution_mode="sync" and crum.impersonate to run tasks synchronously, which allows errors to surface immediately rather than being lost in background workers. """ def setUp(self): - """Set up test user with block_execution=True and disable unneeded features.""" + """Set up test user with import_execution_mode="sync" and disable unneeded features.""" super().setUp() - # Create test user with block_execution=True to run tasks synchronously + # Create test user with import_execution_mode="sync" to run tasks synchronously self.testuser = User.objects.create( username="test_async_delete_user", is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, block_execution=True) + UserContactInfo.objects.create(user=self.testuser, import_execution_mode="sync") # Log in as the test user (for API client) self.client.force_login(self.testuser) diff --git a/unittests/test_cascade_delete.py b/unittests/test_cascade_delete.py index f24667e44f8..8f8a4244610 100644 --- a/unittests/test_cascade_delete.py +++ b/unittests/test_cascade_delete.py @@ -37,7 +37,7 @@ def setUp(self): is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, block_execution=True) + UserContactInfo.objects.create(user=self.testuser, import_execution_mode="sync") self.system_settings(enable_deduplication=False) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_celery_dispatch_force_async.py b/unittests/test_celery_dispatch_force_async.py index 71170e1bb09..7cf523d78d2 100644 --- a/unittests/test_celery_dispatch_force_async.py +++ b/unittests/test_celery_dispatch_force_async.py @@ -3,7 +3,7 @@ `force_async=True` is for callers (e.g. the watson async indexer middleware) that should always run their celery task in the background even when the -current user has `block_execution=True` or the caller passes `force_sync=True`. +current user has `import_execution_mode="sync"` or the caller passes `force_sync=True`. """ from unittest.mock import patch diff --git a/unittests/test_deduplication_logic.py b/unittests/test_deduplication_logic.py index fd6b5d2847c..e1f5e08d05e 100644 --- a/unittests/test_deduplication_logic.py +++ b/unittests/test_deduplication_logic.py @@ -162,7 +162,7 @@ class TestDuplicationLogic(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_duplication_loops.py b/unittests/test_duplication_loops.py index a3a70cf5c2b..fabeb376de0 100644 --- a/unittests/test_duplication_loops.py +++ b/unittests/test_duplication_loops.py @@ -19,7 +19,7 @@ class TestDuplicationLoops(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_false_positive_history_logic.py b/unittests/test_false_positive_history_logic.py index 5975348e14e..bc8ebb56641 100644 --- a/unittests/test_false_positive_history_logic.py +++ b/unittests/test_false_positive_history_logic.py @@ -131,7 +131,7 @@ class TestFalsePositiveHistoryLogic(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.save() # Unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_finding_helper.py b/unittests/test_finding_helper.py index f6d9e747ff2..2e50424719b 100644 --- a/unittests/test_finding_helper.py +++ b/unittests/test_finding_helper.py @@ -256,7 +256,7 @@ def setUp(self): super().setUp() self.system_settings(enable_jira=True) self.testuser = User.objects.get(username="admin") - self.testuser.usercontactinfo.block_execution = True + self.testuser.usercontactinfo.import_execution_mode = "sync" self.testuser.usercontactinfo.save() token = Token.objects.get(user=self.testuser) self.client = APIClient() diff --git a/unittests/test_finding_model.py b/unittests/test_finding_model.py index 136a5e0e5f8..4d2cb4b2761 100644 --- a/unittests/test_finding_model.py +++ b/unittests/test_finding_model.py @@ -558,7 +558,7 @@ class TestFindingSLAExpiration(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_import_execution_mode.py b/unittests/test_import_execution_mode.py new file mode 100644 index 00000000000..568439f7ff6 --- /dev/null +++ b/unittests/test_import_execution_mode.py @@ -0,0 +1,147 @@ +from django.test import override_settings + +from dojo.importers.default_importer import DefaultImporter +from dojo.models import ( + IMPORT_EXECUTION_MODE_ASYNC, + IMPORT_EXECUTION_MODE_ASYNC_WAIT, + IMPORT_EXECUTION_MODE_SYNC, + Development_Environment, + Dojo_User, + Engagement, + UserContactInfo, +) + +from .dojo_test_case import DojoAPITestCase, DojoTestCase, get_unit_tests_path + + +class ImportExecutionModeResolverTest(DojoTestCase): + + """resolve_import_execution_mode: request override > profile > default.""" + + fixtures = ["dojo_testdata.json"] + + def setUp(self): + self.user = Dojo_User.objects.get(username="admin") + UserContactInfo.objects.filter(user=self.user).delete() + + def _set_profile(self, *, mode=None): + UserContactInfo.objects.update_or_create( + user=self.user, + defaults={"import_execution_mode": mode}, + ) + self.user.refresh_from_db() + + def test_default_is_async(self): + self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, Dojo_User.resolve_import_execution_mode(self.user)) + + def test_request_override_wins_over_profile(self): + self._set_profile(mode=IMPORT_EXECUTION_MODE_SYNC) + self.assertEqual( + IMPORT_EXECUTION_MODE_ASYNC_WAIT, + Dojo_User.resolve_import_execution_mode(self.user, IMPORT_EXECUTION_MODE_ASYNC_WAIT), + ) + + def test_profile_mode_used_when_no_override(self): + self._set_profile(mode=IMPORT_EXECUTION_MODE_ASYNC_WAIT) + self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC_WAIT, Dojo_User.resolve_import_execution_mode(self.user)) + + def test_empty_profile_falls_back_to_async(self): + self._set_profile(mode=None) + self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, Dojo_User.resolve_import_execution_mode(self.user)) + + def test_invalid_override_ignored(self): + self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, Dojo_User.resolve_import_execution_mode(self.user, "garbage")) + + def test_no_user(self): + self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, Dojo_User.resolve_import_execution_mode(None)) + + def test_wants_block_execution_only_for_sync_mode(self): + self._set_profile(mode=IMPORT_EXECUTION_MODE_SYNC) + self.assertTrue(Dojo_User.wants_block_execution(self.user)) + self._set_profile(mode=IMPORT_EXECUTION_MODE_ASYNC_WAIT) + self.assertFalse(Dojo_User.wants_block_execution(self.user)) + self._set_profile(mode=None) + self.assertFalse(Dojo_User.wants_block_execution(self.user)) + + +class ImporterDispatchKwargsTest(DojoTestCase): + + """import_execution_mode -> dojo_dispatch_task force flags.""" + + fixtures = ["dojo_testdata.json"] + + def _importer(self, mode, **extra): + return DefaultImporter( + scan_type="ZAP Scan", + engagement=Engagement.objects.first(), + environment=Development_Environment.objects.first(), + import_execution_mode=mode, + **extra, + ) + + def test_sync_mode_forces_sync(self): + self.assertEqual({"force_sync": True}, self._importer(IMPORT_EXECUTION_MODE_SYNC).post_processing_dispatch_kwargs()) + + def test_async_wait_mode_forces_async(self): + self.assertEqual({"force_async": True}, self._importer(IMPORT_EXECUTION_MODE_ASYNC_WAIT).post_processing_dispatch_kwargs()) + + def test_async_mode_preserves_external_force_sync(self): + importer = self._importer(IMPORT_EXECUTION_MODE_ASYNC) + self.assertEqual({"force_sync": False}, importer.post_processing_dispatch_kwargs()) + self.assertEqual({"force_sync": True}, importer.post_processing_dispatch_kwargs(force_sync=True)) + + def test_invalid_mode_defaults_to_async(self): + self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, self._importer("nonsense").import_execution_mode) + + def test_external_force_sync_promotes_to_sync_mode(self): + importer = self._importer(IMPORT_EXECUTION_MODE_ASYNC, force_sync=True) + self.assertEqual(IMPORT_EXECUTION_MODE_SYNC, importer.import_execution_mode) + + +@override_settings(CELERY_TASK_ALWAYS_EAGER=True) +class ImportExecutionModeAPITest(DojoAPITestCase): + + """ + End-to-end: the import endpoints accept and honor import_execution_mode. + + CELERY_TASK_ALWAYS_EAGER runs dispatched tasks inline against the test DB, so + 'async_wait' can actually join its deduplication batch (a real broker/worker + runs against a different DB and could never see the test transaction's data). + """ + + fixtures = ["dojo_testdata.json"] + + def setUp(self): + super().setUp() + self.login_as_admin() + + def _payload(self, mode): + return { + "minimum_severity": "Low", + "scan_type": "ZAP Scan", + "engagement": 1, + "import_execution_mode": mode, + } + + def test_import_async_wait_returns_statistics(self): + with (get_unit_tests_path() / "scans/zap/0_zap_sample.xml").open(encoding="utf-8") as testfile: + payload = self._payload(IMPORT_EXECUTION_MODE_ASYNC_WAIT) + payload["file"] = testfile + result = self.import_scan(payload, 201) + self.assertIn("statistics", result) + self.assertIn("after", result["statistics"]) + # async_wait joins deduplication, so it must report completion + self.assertTrue(result["deduplication_complete"]) + + def test_import_async_does_not_await_deduplication(self): + with (get_unit_tests_path() / "scans/zap/0_zap_sample.xml").open(encoding="utf-8") as testfile: + payload = self._payload(IMPORT_EXECUTION_MODE_ASYNC) + payload["file"] = testfile + result = self.import_scan(payload, 201) + self.assertFalse(result["deduplication_complete"]) + + def test_import_rejects_invalid_mode(self): + with (get_unit_tests_path() / "scans/zap/0_zap_sample.xml").open(encoding="utf-8") as testfile: + payload = self._payload("not-a-mode") + payload["file"] = testfile + self.import_scan(payload, 400) diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 417fe0ea9ea..26c70f2a52e 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -2323,7 +2323,7 @@ def __init__(self, *args, **kwargs): def setUp(self): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() # self.url = reverse(self.viewname + '-list') @@ -3122,7 +3122,7 @@ def __init__(self, *args, **kwargs): def setUp(self): # still using the API to verify results testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() # self.url = reverse(self.viewname + '-list') diff --git a/unittests/test_importers_deduplication.py b/unittests/test_importers_deduplication.py index 7c1359dcc36..0bd2a71568c 100644 --- a/unittests/test_importers_deduplication.py +++ b/unittests/test_importers_deduplication.py @@ -37,7 +37,7 @@ def setUp(self): testuser.is_superuser = True testuser.is_staff = True testuser.save() - UserContactInfo.objects.create(user=testuser, block_execution=True) + UserContactInfo.objects.create(user=testuser, import_execution_mode="sync") # Authenticate API client as admin for import endpoints self.login_as_admin() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 47d30a99824..1d011c747bb 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -67,7 +67,7 @@ def setUp(self): super().setUp() testuser, _ = User.objects.get_or_create(username="admin") - UserContactInfo.objects.update_or_create(user=testuser, defaults={"block_execution": False}) + UserContactInfo.objects.update_or_create(user=testuser, defaults={"import_execution_mode": None}) self.system_settings(enable_product_grade=False) self.system_settings(enable_github=False) @@ -363,7 +363,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self._import_reimport_performance( @@ -387,7 +387,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self.system_settings(enable_product_grade=True) @@ -541,7 +541,7 @@ def test_deduplication_performance_pghistory_no_async(self): self.system_settings(enable_deduplication=True) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self._deduplication_performance( @@ -653,7 +653,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self._import_reimport_performance( @@ -677,7 +677,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self.system_settings(enable_product_grade=True) @@ -805,7 +805,7 @@ def test_deduplication_performance_pghistory_no_async(self): self.system_settings(enable_deduplication=True) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self._deduplication_performance( diff --git a/unittests/test_jira_config_engagement.py b/unittests/test_jira_config_engagement.py index aaabfdddc5e..7b8beb6e8ea 100644 --- a/unittests/test_jira_config_engagement.py +++ b/unittests/test_jira_config_engagement.py @@ -252,7 +252,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.user = self.get_test_admin() self.client.force_login(self.user) - self.user.usercontactinfo.block_execution = True + self.user.usercontactinfo.import_execution_mode = "sync" self.user.usercontactinfo.save() # product 3 has no jira project config, double check to make sure someone didn't molest the fixture # running this in __init__ throws database access denied error diff --git a/unittests/test_jira_config_engagement_epic.py b/unittests/test_jira_config_engagement_epic.py index 614a10fbe57..6f314b9748c 100644 --- a/unittests/test_jira_config_engagement_epic.py +++ b/unittests/test_jira_config_engagement_epic.py @@ -39,7 +39,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.user = self.get_test_admin() self.client.force_login(self.user) - self.user.usercontactinfo.block_execution = True + self.user.usercontactinfo.import_execution_mode = "sync" self.user.usercontactinfo.save() # product 3 has no jira project config, double check to make sure someone didn't molest the fixture # running this in __init__ throws database access denied error diff --git a/unittests/test_jira_import_and_pushing_api.py b/unittests/test_jira_import_and_pushing_api.py index eb3f0692dbc..7d472633363 100644 --- a/unittests/test_jira_import_and_pushing_api.py +++ b/unittests/test_jira_import_and_pushing_api.py @@ -73,7 +73,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.system_settings(enable_webhooks_notifications=True) self.testuser = User.objects.get(username="admin") - self.testuser.usercontactinfo.block_execution = True + self.testuser.usercontactinfo.import_execution_mode = "sync" self.testuser.usercontactinfo.save() token = Token.objects.get(user=self.testuser) self.client = APIClient() diff --git a/unittests/test_notifications.py b/unittests/test_notifications.py index 504e168e70b..b9153ada311 100644 --- a/unittests/test_notifications.py +++ b/unittests/test_notifications.py @@ -641,9 +641,9 @@ class TestAsyncNotificationTaskBody(DojoTestCase): def run(self, result=None): # Same sync pattern used by TestNotificationWebhooks: run under an impersonated user - # with block_execution=True so downstream dojo_dispatch_task calls execute inline. + # with import_execution_mode="sync" so downstream dojo_dispatch_task calls execute inline. testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.save() with impersonate(testuser): super().run(result) @@ -677,8 +677,8 @@ def test_async_task_returns_silently_on_missing_instance(self, mock_process): @patch("dojo.notifications.helper.create_notification") def test_dispatch_respects_block_execution(self, mock_create): - """With block_execution=True on the impersonated user, the post_save signal runs the task body inline.""" - # The run() wrapper impersonates admin with block_execution=True, so + """With import_execution_mode="sync" on the impersonated user, the post_save signal runs the task body inline.""" + # The run() wrapper impersonates admin with import_execution_mode="sync", so # dojo_dispatch_task takes the sync branch and the task body calls create_notification # (module-level helper inside async_create_notification) synchronously. prod = Product.objects.first() @@ -705,7 +705,7 @@ def run(self, result=None): if getattr(self, "__unittest_skip__", False): return super().run(result) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_prepare_duplicates_for_delete.py b/unittests/test_prepare_duplicates_for_delete.py index 89ed84069db..2981abe796b 100644 --- a/unittests/test_prepare_duplicates_for_delete.py +++ b/unittests/test_prepare_duplicates_for_delete.py @@ -45,7 +45,7 @@ def setUp(self): is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, block_execution=True) + UserContactInfo.objects.create(user=self.testuser, import_execution_mode="sync") self.system_settings(enable_deduplication=False) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_product_grading.py b/unittests/test_product_grading.py index 5f4680c3315..4f929233b81 100644 --- a/unittests/test_product_grading.py +++ b/unittests/test_product_grading.py @@ -12,7 +12,7 @@ class ProductGradeTest(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_reimport_prefetch.py b/unittests/test_reimport_prefetch.py index cc5b2f40e1e..fb6bff558c2 100644 --- a/unittests/test_reimport_prefetch.py +++ b/unittests/test_reimport_prefetch.py @@ -72,7 +72,7 @@ class ReimportDuplicateFindingsTestBase(DojoTestCase): def setUp(self): super().setUp() testuser, _ = User.objects.get_or_create(username="admin") - UserContactInfo.objects.get_or_create(user=testuser, defaults={"block_execution": True}) + UserContactInfo.objects.get_or_create(user=testuser, defaults={"import_execution_mode": "sync"}) self.system_settings(enable_deduplication=True) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_tag_inheritance.py b/unittests/test_tag_inheritance.py index f1f9dce8505..ba9ac82adda 100644 --- a/unittests/test_tag_inheritance.py +++ b/unittests/test_tag_inheritance.py @@ -713,7 +713,7 @@ class InheritedTagsImportTestAPI(DojoAPITestCase, InheritedTagsImportMixin): def setUp(self): super().setUp() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() settings.SECURE_SSL_REDIRECT = False @@ -731,7 +731,7 @@ class InheritedTagsImportTestUI(DojoAPITestCase, InheritedTagsImportMixin): def setUp(self): super().setUp() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() settings.SECURE_SSL_REDIRECT = False diff --git a/unittests/test_tags.py b/unittests/test_tags.py index ea460f247bf..533d9d356b1 100644 --- a/unittests/test_tags.py +++ b/unittests/test_tags.py @@ -387,7 +387,7 @@ def setUp(self): super().setUp() settings.SECURE_SSL_REDIRECT = False testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() TagImportMixin.setUp(self) @@ -404,7 +404,7 @@ def setUp(self): super().setUp() settings.SECURE_SSL_REDIRECT = False testuser = User.objects.get(username="admin") - testuser.usercontactinfo.block_execution = True + testuser.usercontactinfo.import_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() self.client_ui = Client() diff --git a/unittests/test_watson_async_search_index.py b/unittests/test_watson_async_search_index.py index 8edba606ac7..fe54d61c0f8 100644 --- a/unittests/test_watson_async_search_index.py +++ b/unittests/test_watson_async_search_index.py @@ -20,7 +20,7 @@ def setUp(self): super().setUp() self.testuser = User.objects.create(username="admin", is_staff=True, is_superuser=True) - UserContactInfo.objects.create(user=self.testuser, block_execution=True) + UserContactInfo.objects.create(user=self.testuser, import_execution_mode="sync") self.system_settings(enable_product_grade=False) self.system_settings(enable_github=False) From 6ae08c5ad85ba3ceeb5d230a0bc705716fa9db44 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 10:30:54 +0200 Subject: [PATCH 02/29] feat(importers): make scan_added notifications dedup-aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit notify_scan_added used the importer's in-memory finding lists, whose duplicate flag is stale because deduplication runs on separately-fetched instances. Once deduplication has completed (sync or async_wait, signalled by deduplication_complete), refresh the duplicate flag from the database and split each action list into "real" and duplicate findings: - findings_new -> net-new (excludes deduplicated) - findings_new_duplicate / findings_reactivated_duplicate / findings_untouched_duplicate - finding_count -> recomputed to exclude new/reactivated duplicates (so an all-duplicate import sends scan_added_empty) In plain async mode (dedup not awaited) the lists are left untouched, preserving historical behavior. Mail and webhook scan_added templates render the new duplicate sections. Note: import/reimport response statistics are already dedup-accurate once awaited — Test.statistics / Test_Import.statistics query Finding and annotate the duplicate count directly; only the notification used stale in-memory data. --- dojo/importers/base_importer.py | 44 +++++++++++ .../notifications/mail/scan_added.tpl | 33 ++++++++ .../notifications/webhooks/scan_added.tpl | 8 +- unittests/test_import_execution_mode.py | 76 +++++++++++++++++++ 4 files changed, 160 insertions(+), 1 deletion(-) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index b17887b4fb1..a82eea5d73a 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -955,10 +955,49 @@ def notify_scan_added( new_findings = [] logger.debug("Scan added notifications") + # When deduplication has finished (synchronous mode, or async_wait after the + # join), the in-memory findings still carry their pre-dedup duplicate=False + # flag because deduplication runs on separately-fetched instances. Refresh the + # flag from the database and split each list into "real" and duplicate findings + # so the notification reflects post-dedup reality instead of counting/listing + # deduplicated findings as brand new. In plain async mode dedup has not run yet, + # so we leave the lists untouched (best-effort, historical behavior). + findings_new_duplicate: list[Finding] = [] + findings_reactivated_duplicate: list[Finding] = [] + findings_untouched_duplicate: list[Finding] = [] + if getattr(self, "deduplication_complete", False): + all_ids = [f.id for f in (*new_findings, *findings_reactivated, *findings_untouched)] + duplicate_ids = set() + if all_ids: + duplicate_ids = set( + Finding.objects.filter(id__in=all_ids, duplicate=True).values_list("id", flat=True), + ) + + def _split(findings): + kept, duplicates = [], [] + for finding in findings: + if finding.id in duplicate_ids: + # refresh the in-memory flag so any template logic is correct + finding.duplicate = True + duplicates.append(finding) + else: + kept.append(finding) + return kept, duplicates + + new_findings, findings_new_duplicate = _split(new_findings) + findings_reactivated, findings_reactivated_duplicate = _split(findings_reactivated) + findings_untouched, findings_untouched_duplicate = _split(findings_untouched) + # Recompute the headline count to exclude findings that turned out to be + # duplicates of an existing finding (they are not genuinely new activity). + updated_count = len(new_findings) + len(findings_reactivated) + len(findings_mitigated) + new_findings = sorted(new_findings, key=lambda x: x.numerical_severity) findings_mitigated = sorted(findings_mitigated, key=lambda x: x.numerical_severity) findings_reactivated = sorted(findings_reactivated, key=lambda x: x.numerical_severity) findings_untouched = sorted(findings_untouched, key=lambda x: x.numerical_severity) + findings_new_duplicate = sorted(findings_new_duplicate, key=lambda x: x.numerical_severity) + findings_reactivated_duplicate = sorted(findings_reactivated_duplicate, key=lambda x: x.numerical_severity) + findings_untouched_duplicate = sorted(findings_untouched_duplicate, key=lambda x: x.numerical_severity) title = ( f"Created/Updated {updated_count} findings for {test.engagement.product}: {test.engagement.name}: {test}" @@ -975,6 +1014,11 @@ def notify_scan_added( engagement=test.engagement, product=test.engagement.product, findings_untouched=findings_untouched, + # Findings deduplicated during post-processing, split by their import action. + # Populated only once deduplication has completed (sync / async_wait). + findings_new_duplicate=findings_new_duplicate, + findings_reactivated_duplicate=findings_reactivated_duplicate, + findings_untouched_duplicate=findings_untouched_duplicate, url=reverse("view_test", args=(test.id,)), url_api=reverse("test-detail", args=(test.id,)), ) diff --git a/dojo/templates_classic/notifications/mail/scan_added.tpl b/dojo/templates_classic/notifications/mail/scan_added.tpl index 263585246e0..567d80ee47e 100644 --- a/dojo/templates_classic/notifications/mail/scan_added.tpl +++ b/dojo/templates_classic/notifications/mail/scan_added.tpl @@ -26,6 +26,17 @@ {% endfor %}

+ {% if findings_new_duplicate %} +

+

+ {% blocktranslate %}New findings detected as duplicates{% endblocktranslate %} ({{ findings_new_duplicate | length }})
+ {% for finding in findings_new_duplicate %} + {% url 'view_finding' finding.id as finding_url %} + {{ finding.title }} ({{ finding.severity }})
+ {% endfor %} +
+

+ {% endif %}

{% blocktranslate %}Reactivated findings{% endblocktranslate %} ({{ findings_reactivated | length }})
@@ -37,6 +48,17 @@ {% endfor %}

+ {% if findings_reactivated_duplicate %} +

+

+ {% blocktranslate %}Reactivated findings detected as duplicates{% endblocktranslate %} ({{ findings_reactivated_duplicate | length }})
+ {% for finding in findings_reactivated_duplicate %} + {% url 'view_finding' finding.id as finding_url %} + {{ finding.title }} ({{ finding.severity }})
+ {% endfor %} +
+

+ {% endif %}

{% blocktranslate %}Closed findings{% endblocktranslate %} ({{ findings_mitigated | length }})
@@ -59,6 +81,17 @@ {% endfor %}

+ {% if findings_untouched_duplicate %} +

+

+ {% blocktranslate %}Existing findings detected as duplicates{% endblocktranslate %} ({{ findings_untouched_duplicate | length }})
+ {% for finding in findings_untouched_duplicate %} + {% url 'view_finding' finding.id as finding_url %} + {{ finding.title }} ({{ finding.severity }})
+ {% endfor %} +
+

+ {% endif %}

{% trans "Kind regards" %},

diff --git a/dojo/templates_classic/notifications/webhooks/scan_added.tpl b/dojo/templates_classic/notifications/webhooks/scan_added.tpl index b42096bfba2..0f68a72eb12 100644 --- a/dojo/templates_classic/notifications/webhooks/scan_added.tpl +++ b/dojo/templates_classic/notifications/webhooks/scan_added.tpl @@ -8,5 +8,11 @@ findings: {% include 'notifications/webhooks/subtemplates/findings_list.tpl' with findings=findings_reactivated %} mitigated: {% include 'notifications/webhooks/subtemplates/findings_list.tpl' with findings=findings_mitigated %} - untouched: + untouched: {% include 'notifications/webhooks/subtemplates/findings_list.tpl' with findings=findings_untouched %} + new_duplicate: +{% include 'notifications/webhooks/subtemplates/findings_list.tpl' with findings=findings_new_duplicate %} + reactivated_duplicate: +{% include 'notifications/webhooks/subtemplates/findings_list.tpl' with findings=findings_reactivated_duplicate %} + untouched_duplicate: +{% include 'notifications/webhooks/subtemplates/findings_list.tpl' with findings=findings_untouched_duplicate %} diff --git a/unittests/test_import_execution_mode.py b/unittests/test_import_execution_mode.py index 568439f7ff6..a9232907df5 100644 --- a/unittests/test_import_execution_mode.py +++ b/unittests/test_import_execution_mode.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + from django.test import override_settings from dojo.importers.default_importer import DefaultImporter @@ -8,6 +10,8 @@ Development_Environment, Dojo_User, Engagement, + Finding, + Test, UserContactInfo, ) @@ -145,3 +149,75 @@ def test_import_rejects_invalid_mode(self): payload = self._payload("not-a-mode") payload["file"] = testfile self.import_scan(payload, 400) + + +class NotificationDeduplicationRefreshTest(DojoTestCase): + + """notify_scan_added refreshes duplicate status from the DB once dedup is complete.""" + + fixtures = ["dojo_testdata.json"] + + def _importer(self): + test = Test.objects.first() + importer = DefaultImporter( + scan_type="ZAP Scan", + engagement=test.engagement, + environment=Development_Environment.objects.first(), + ) + return importer, test + + @patch("dojo.importers.base_importer.create_notification") + def test_deduplicated_new_findings_excluded_when_complete(self, mock_notify): + importer, test = self._importer() + importer.deduplication_complete = True + + real = Finding(test=test, title="real finding", severity="High") + real.save() + dupe = Finding(test=test, title="dupe finding", severity="High") + dupe.save() + # Simulate background deduplication having flagged the second finding. + Finding.objects.filter(pk=dupe.pk).update(duplicate=True) + + importer.notify_scan_added(test, updated_count=2, new_findings=[real, dupe]) + + kwargs = mock_notify.call_args.kwargs + self.assertEqual([f.id for f in kwargs["findings_new"]], [real.id]) + self.assertEqual([f.id for f in kwargs["findings_new_duplicate"]], [dupe.id]) + # headline count excludes the deduplicated finding + self.assertEqual(kwargs["finding_count"], 1) + self.assertEqual(kwargs["event"], "scan_added") + + @patch("dojo.importers.base_importer.create_notification") + def test_async_mode_does_not_refresh(self, mock_notify): + importer, test = self._importer() + importer.deduplication_complete = False # plain async: dedup not awaited + + dupe = Finding(test=test, title="async dupe", severity="High") + dupe.save() + Finding.objects.filter(pk=dupe.pk).update(duplicate=True) + + importer.notify_scan_added(test, updated_count=1, new_findings=[dupe]) + + kwargs = mock_notify.call_args.kwargs + # historical behavior: duplicate still listed/counted as new + self.assertEqual([f.id for f in kwargs["findings_new"]], [dupe.id]) + self.assertEqual(kwargs["findings_new_duplicate"], []) + self.assertEqual(kwargs["finding_count"], 1) + + @patch("dojo.importers.base_importer.create_notification") + def test_all_new_findings_duplicate_yields_empty_event(self, mock_notify): + importer, test = self._importer() + importer.deduplication_complete = True + + dupe = Finding(test=test, title="only dupe", severity="Low") + dupe.save() + Finding.objects.filter(pk=dupe.pk).update(duplicate=True) + + importer.notify_scan_added(test, updated_count=1, new_findings=[dupe]) + + kwargs = mock_notify.call_args.kwargs + self.assertEqual(kwargs["findings_new"], []) + self.assertEqual([f.id for f in kwargs["findings_new_duplicate"]], [dupe.id]) + self.assertEqual(kwargs["finding_count"], 0) + # net-new is zero -> empty scan notification + self.assertEqual(kwargs["event"], "scan_added_empty") From 06d6404d0f9a3938ee594537b276f96c29d49547 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 14:47:37 +0200 Subject: [PATCH 03/29] refactor(importers): await dedup before all import notifications Move wait_for_post_processing() ahead of the test_added notification dispatch (not just notify_scan_added) so both notifications sent during a fresh import are emitted after deduplication has completed in async_wait mode. The reimporter already orders the await before its single notification. --- dojo/importers/default_importer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index cd7b65296cb..341e8c959b9 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -135,6 +135,10 @@ def process_scan( new_findings=new_findings, closed_findings=closed_findings, ) + # In 'async_wait' mode, block until background deduplication has finished + # so notifications and statistics reflect the deduplicated state. + self.wait_for_post_processing() + # Send out some notifications to the user logger.debug("IMPORT_SCAN: Generating notifications") dojo_dispatch_task( @@ -147,9 +151,6 @@ def process_scan( url=reverse("view_test", args=(self.test.id,)), url_api=reverse("test-detail", args=(self.test.id,)), ) - # In 'async_wait' mode, block until background deduplication has finished - # so notifications and statistics reflect the deduplicated state. - self.wait_for_post_processing() updated_count = len(new_findings) + len(closed_findings) self.notify_scan_added( self.test, From 54807417954f9424643bba9511922958036a29f9 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 14:53:44 +0200 Subject: [PATCH 04/29] refactor: rename import_execution_mode to deduplication_execution_mode The mode governs whether the request waits for deduplication, so name it for that. Rename the UserContactInfo field, the request/serializer field, the ImporterOptions attribute and validator, the resolver, and the IMPORT_EXECUTION_MODE_* constants to DEDUPLICATION_EXECUTION_MODE_*. Add a RenameField migration (0271) rather than mutating the add/remove migrations introduced earlier on this branch. --- CLAUDE.local.md | 36 ++ dojo/api_v2/serializers.py | 12 +- dojo/celery_dispatch.py | 2 +- ...ame_import_execution_mode_deduplication.py | 16 + dojo/decorators.py | 6 +- dojo/importers/base_importer.py | 12 +- dojo/importers/default_importer.py | 4 +- dojo/importers/default_reimporter.py | 4 +- dojo/importers/options.py | 20 +- dojo/middleware.py | 2 +- dojo/models.py | 10 +- dojo/sql_stacktrace.py | 51 +++ dojo/templates/dojo/view_user.html | 2 +- dojo/templates_classic/dojo/view_user.html | 2 +- dojo/user/models.py | 40 +-- dojo/user/ui/forms.py | 2 +- fix-jira-push-eligibility.md | 111 ++++++ phase10-session-prompt.md | 69 ++++ scripts/run-nuclei-docker.sh | 40 +++ tagoulus-replcement-options.md | 319 ++++++++++++++++++ unittests/test_async_delete.py | 8 +- unittests/test_cascade_delete.py | 2 +- unittests/test_celery_dispatch_force_async.py | 2 +- unittests/test_deduplication_logic.py | 2 +- unittests/test_duplication_loops.py | 2 +- .../test_false_positive_history_logic.py | 2 +- unittests/test_finding_helper.py | 2 +- unittests/test_finding_model.py | 2 +- unittests/test_import_execution_mode.py | 56 +-- unittests/test_import_reimport.py | 4 +- unittests/test_importers_deduplication.py | 2 +- unittests/test_importers_performance.py | 14 +- unittests/test_jira_config_engagement.py | 2 +- unittests/test_jira_config_engagement_epic.py | 2 +- unittests/test_jira_import_and_pushing_api.py | 2 +- unittests/test_notifications.py | 10 +- .../test_prepare_duplicates_for_delete.py | 2 +- unittests/test_product_grading.py | 2 +- unittests/test_reimport_prefetch.py | 2 +- unittests/test_tag_inheritance.py | 4 +- unittests/test_tags.py | 4 +- unittests/test_watson_async_search_index.py | 2 +- 42 files changed, 766 insertions(+), 124 deletions(-) create mode 100644 CLAUDE.local.md create mode 100644 dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py create mode 100644 dojo/sql_stacktrace.py create mode 100644 fix-jira-push-eligibility.md create mode 100644 phase10-session-prompt.md create mode 100644 scripts/run-nuclei-docker.sh create mode 100644 tagoulus-replcement-options.md diff --git a/CLAUDE.local.md b/CLAUDE.local.md new file mode 100644 index 00000000000..584b0167d4d --- /dev/null +++ b/CLAUDE.local.md @@ -0,0 +1,36 @@ +# Claude Code Instructions + +Follow all guidelines in [AGENTS.md](AGENTS.md). Key points repeated here for emphasis: + +## Pull Request Descriptions + +- Include a **Summary** section only — bullet points describing what changed and why. +- **Do NOT add any of the following sections**: "Test plan", "Testing", "How to test", or any other testing-related section. DefectDojo's PR template does not use one. +- **NEVER overwrite an existing PR description** without first reading it with `gh pr view --json body -q '.body'`. +- **NEVER mention DefectDojo Pro** in PRs, issues, or comments targeting the open source repo. +- **Always edit PR bodies via a file**, never inline. `gh pr edit --body "$(cat <<'EOF' ... EOF)"` silently exits 1 on this repo because the Projects (classic) GraphQL deprecation warning trips gh's error path. Use one of: + - `gh pr edit --body-file /tmp/pr_body.md` + - `gh api -X PATCH /repos///pulls/ --input ` (REST, fully bypasses the GraphQL warning) + Always verify with `gh pr view --json body -q '.body'` after — `gh pr edit` may print no error but still leave the body unchanged. + +## PR URLs + +- Always format PR URLs as markdown links: `[PR #123](https://github.com/owner/repo/pull/123)` + +## Git Commits + +- **NEVER commit `CLAUDE.md`** — it is a local instruction file, not part of the upstream codebase. +- **NEVER commit files from `.claude/`** — these are local session artifacts. + +## Running Unit Tests + +- **Always use `./run-unittest.sh`** to run unit tests — never `python -m pytest` or `python manage.py test` directly: + ```bash + ./run-unittest.sh --test-case unittests.test_module.TestClass 2>&1 | tee /tmp/test_output.log + ``` +- **ALWAYS pipe output through `tee` to capture it to a file.** This is mandatory — it lets you grep the results without re-running expensive tests. +- After running, analyze with `grep` on the captured file, not by re-running: + ```bash + grep -E "PASSED|FAILED|ERROR|error" /tmp/test_output.log + ``` +- **Narrate every test iteration**: report pass/fail immediately after each run, explain what failed and what you plan to fix before making changes, then explain what you changed before re-running. diff --git a/dojo/api_v2/serializers.py b/dojo/api_v2/serializers.py index a1d804a72d0..80485f7e8c4 100644 --- a/dojo/api_v2/serializers.py +++ b/dojo/api_v2/serializers.py @@ -24,8 +24,8 @@ from dojo.importers.default_reimporter import DefaultReImporter from dojo.location.models import Location from dojo.models import ( + DEDUPLICATION_EXECUTION_MODE_CHOICES, IMPORT_ACTIONS, - IMPORT_EXECUTION_MODE_CHOICES, SEVERITIES, SEVERITY_CHOICES, STATS_FIELDS, @@ -433,15 +433,15 @@ class CommonImportScanSerializer(serializers.Serializer): allow_null=True, default=None, queryset=User.objects.all(), ) push_to_jira = serializers.BooleanField(default=False) - import_execution_mode = serializers.ChoiceField( + deduplication_execution_mode = serializers.ChoiceField( required=False, allow_null=True, - choices=IMPORT_EXECUTION_MODE_CHOICES, + choices=DEDUPLICATION_EXECUTION_MODE_CHOICES, help_text="Override how import post-processing (deduplication, jira push, grading, ...) is executed for " "this request. 'async' dispatches post-processing to the background and responds immediately (default). " "'async_wait' dispatches to the background but waits for deduplication to finish before responding, so " "notifications and the returned statistics reflect the deduplicated state. 'sync' runs everything inline. " - "If omitted, falls back to the user's profile setting (import_execution_mode).", + "If omitted, falls back to the user's profile setting (deduplication_execution_mode).", ) environment = serializers.CharField(required=False) build_id = serializers.CharField( @@ -657,8 +657,8 @@ def setup_common_context(self, data: dict) -> dict: # takes precedence over the user's profile setting, otherwise default async. request = self.context.get("request") user = getattr(request, "user", None) - context["import_execution_mode"] = Dojo_User.resolve_import_execution_mode( - user, data.get("import_execution_mode"), + context["deduplication_execution_mode"] = Dojo_User.resolve_deduplication_execution_mode( + user, data.get("deduplication_execution_mode"), ) return context diff --git a/dojo/celery_dispatch.py b/dojo/celery_dispatch.py index bd552eab7ea..3171291d5ee 100644 --- a/dojo/celery_dispatch.py +++ b/dojo/celery_dispatch.py @@ -62,7 +62,7 @@ def dojo_dispatch_task(task_or_sig: _SupportsSi | _SupportsApplyAsync | Signatur - Inject `async_user_id` if missing. - Capture and inject pghistory context if available. - Respect `force_sync=True` (foreground execution) and the user's - synchronous import_execution_mode. + synchronous deduplication_execution_mode. - Respect `force_async=True` (background execution even when the caller would otherwise run synchronously, e.g. user on the synchronous mode). `force_async` wins over `force_sync` and the user's mode. diff --git a/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py b/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py new file mode 100644 index 00000000000..51812be16d3 --- /dev/null +++ b/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py @@ -0,0 +1,16 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('dojo', '0270_remove_usercontactinfo_block_execution'), + ] + + operations = [ + migrations.RenameField( + model_name='usercontactinfo', + old_name='import_execution_mode', + new_name='deduplication_execution_mode', + ), + ] diff --git a/dojo/decorators.py b/dojo/decorators.py index 7e533368f77..d938c6e2661 100644 --- a/dojo/decorators.py +++ b/dojo/decorators.py @@ -76,15 +76,15 @@ def we_want_async(*args, func=None, **kwargs): return True if Dojo_User.wants_block_execution(user): - logger.debug("dojo_async_task %s: running task in the foreground as import_execution_mode is 'sync' for %s", func, user) + logger.debug("dojo_async_task %s: running task in the foreground as deduplication_execution_mode is 'sync' for %s", func, user) return False - logger.debug("dojo_async_task %s: running task in the background as import_execution_mode is not 'sync' for %s", func, user) + logger.debug("dojo_async_task %s: running task in the background as deduplication_execution_mode is not 'sync' for %s", func, user) return True # Defect Dojo performs all tasks asynchrnonously using celery -# *unless* the user initiating the task has set import_execution_mode to 'sync' in their usercontactinfo profile +# *unless* the user initiating the task has set deduplication_execution_mode to 'sync' in their usercontactinfo profile def dojo_async_task(func=None, *, signature=False): def decorator(func): @wraps(func) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index a82eea5d73a..13617794ca6 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -16,11 +16,11 @@ from dojo.jira.services import is_keep_in_sync from dojo.location.models import Location from dojo.models import ( + DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, + DEDUPLICATION_EXECUTION_MODE_SYNC, # Import History States IMPORT_CLOSED_FINDING, IMPORT_CREATED_FINDING, - IMPORT_EXECUTION_MODE_ASYNC_WAIT, - IMPORT_EXECUTION_MODE_SYNC, IMPORT_REACTIVATED_FINDING, IMPORT_UNTOUCHED_FINDING, # Finding Severities @@ -103,9 +103,9 @@ def post_processing_dispatch_kwargs(self, **kwargs): - ASYNC (default): preserve historical behavior, honoring any externally supplied force_sync and the user's sync mode via we_want_async. """ - if self.import_execution_mode == IMPORT_EXECUTION_MODE_SYNC: + if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_SYNC: return {"force_sync": True} - if self.import_execution_mode == IMPORT_EXECUTION_MODE_ASYNC_WAIT: + if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT: return {"force_async": True} return {"force_sync": kwargs.get("force_sync", False)} @@ -130,11 +130,11 @@ def wait_for_post_processing(self): settings.IMPORT_ASYNC_WAIT_TIMEOUT so a stuck/missing worker degrades to the historical (respond-anyway) behavior instead of hanging. """ - if self.import_execution_mode == IMPORT_EXECUTION_MODE_SYNC: + if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_SYNC: # Batches ran inline during process_findings, so dedup is already done. self.deduplication_complete = True return - if self.import_execution_mode != IMPORT_EXECUTION_MODE_ASYNC_WAIT: + if self.deduplication_execution_mode != DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT: # 'async': post-processing was dispatched but is not awaited. self.deduplication_complete = False return diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 341e8c959b9..d7d6d6e09a8 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -12,7 +12,7 @@ from dojo.importers.options import ImporterOptions from dojo.jira import services as jira_services from dojo.models import ( - IMPORT_EXECUTION_MODE_ASYNC_WAIT, + DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, Engagement, Finding, Test, @@ -307,7 +307,7 @@ def _process_findings_internal( push_to_jira=push_to_jira, **self.post_processing_dispatch_kwargs(**kwargs), ) - if self.import_execution_mode == IMPORT_EXECUTION_MODE_ASYNC_WAIT: + if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT: self.record_post_processing_result(result) # No chord: tasks are dispatched immediately above per batch diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 346f6ccb23e..9845c77b227 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -18,7 +18,7 @@ from dojo.importers.options import ImporterOptions from dojo.jira import services as jira_services from dojo.models import ( - IMPORT_EXECUTION_MODE_ASYNC_WAIT, + DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, Development_Environment, Finding, Notes, @@ -473,7 +473,7 @@ def _process_findings_internal( jira_instance_id=getattr(self.jira_instance, "id", None), **self.post_processing_dispatch_kwargs(**kwargs), ) - if self.import_execution_mode == IMPORT_EXECUTION_MODE_ASYNC_WAIT: + if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT: self.record_post_processing_result(result) # No chord: tasks are dispatched immediately above per batch diff --git a/dojo/importers/options.py b/dojo/importers/options.py index 3817a96fe08..d25f1a68b29 100644 --- a/dojo/importers/options.py +++ b/dojo/importers/options.py @@ -12,9 +12,9 @@ from dojo.jira.services import get_instance as get_jira_instance from dojo.models import ( - IMPORT_EXECUTION_MODE_ASYNC, - IMPORT_EXECUTION_MODE_SYNC, - IMPORT_EXECUTION_MODES, + DEDUPLICATION_EXECUTION_MODE_ASYNC, + DEDUPLICATION_EXECUTION_MODE_SYNC, + DEDUPLICATION_EXECUTION_MODES, Development_Environment, Dojo_User, Endpoint, @@ -71,7 +71,7 @@ def load_base_options( self.engagement: Engagement | None = self.validate_engagement(*args, **kwargs) self.environment: Development_Environment | None = self.validate_environment(*args, **kwargs) self.group_by: str = self.validate_group_by(*args, **kwargs) - self.import_execution_mode: str = self.validate_import_execution_mode(*args, **kwargs) + self.deduplication_execution_mode: str = self.validate_deduplication_execution_mode(*args, **kwargs) self.import_type: str = self.validate_import_type(*args, **kwargs) self.lead: Dojo_User | None = self.validate_lead(*args, **kwargs) self.minimum_severity: str = self.validate_minimum_severity(*args, **kwargs) @@ -349,23 +349,23 @@ def validate_do_not_reactivate( **kwargs, ) - def validate_import_execution_mode( + def validate_deduplication_execution_mode( self, *args: list, **kwargs: dict, ) -> str: mode = self.validate( - "import_execution_mode", + "deduplication_execution_mode", expected_types=[str], required=False, - default=IMPORT_EXECUTION_MODE_ASYNC, + default=DEDUPLICATION_EXECUTION_MODE_ASYNC, **kwargs, ) - if mode not in IMPORT_EXECUTION_MODES: - mode = IMPORT_EXECUTION_MODE_ASYNC + if mode not in DEDUPLICATION_EXECUTION_MODES: + mode = DEDUPLICATION_EXECUTION_MODE_ASYNC # An explicit force_sync from a non-serializer caller still wins. if kwargs.get("force_sync"): - mode = IMPORT_EXECUTION_MODE_SYNC + mode = DEDUPLICATION_EXECUTION_MODE_SYNC return mode def validate_commit_hash( diff --git a/dojo/middleware.py b/dojo/middleware.py index 73e0ed601dd..d9cdd626b1e 100644 --- a/dojo/middleware.py +++ b/dojo/middleware.py @@ -281,7 +281,7 @@ def _drain_search_context_to_async(objects, source): for model_name, pk_list in model_groups.items(): batches = [pk_list[i:i + batch_size] for i in range(0, len(pk_list), batch_size)] # force_async=True keeps indexing off the request path even for users - # on the synchronous import_execution_mode — index updates are slow and + # on the synchronous deduplication_execution_mode — index updates are slow and # never need to be synchronous from the user's perspective. for i, batch in enumerate(batches, 1): logger.debug(f"{source}: Triggering batch {i}/{len(batches)} for {model_name}: {len(batch)} instances") diff --git a/dojo/models.py b/dojo/models.py index 2e2b3f899d2..a58fa3c5a74 100644 --- a/dojo/models.py +++ b/dojo/models.py @@ -124,11 +124,11 @@ def __call__(self, model_instance, filename): from dojo.regulations.models import Regulation # noqa: E402, F401, I001 -- re-export; user/system_settings block intentionally out-of-order (load-order) from dojo.user.models import ( # noqa: E402, F401 -- must precede system_settings (middleware load-order) - IMPORT_EXECUTION_MODE_ASYNC, - IMPORT_EXECUTION_MODE_ASYNC_WAIT, - IMPORT_EXECUTION_MODE_CHOICES, - IMPORT_EXECUTION_MODE_SYNC, - IMPORT_EXECUTION_MODES, + DEDUPLICATION_EXECUTION_MODE_ASYNC, + DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, + DEDUPLICATION_EXECUTION_MODE_CHOICES, + DEDUPLICATION_EXECUTION_MODE_SYNC, + DEDUPLICATION_EXECUTION_MODES, Contact, Dojo_User, UserContactInfo, diff --git a/dojo/sql_stacktrace.py b/dojo/sql_stacktrace.py new file mode 100644 index 00000000000..eeb44385f46 --- /dev/null +++ b/dojo/sql_stacktrace.py @@ -0,0 +1,51 @@ +""" +Temporary debugging helper: django-sql-stacktrace-style execute_wrapper. + +Enabled only when the env var DD_SQL_STACKTRACE is set. When active it +registers a connection.execute_wrapper on every DB connection that, for any +SQL matching DD_SQL_STACKTRACE_FILTER (default: 'pro_enhanced_finding'), +prints the originating Python traceback to stderr. Used to pinpoint the exact +call site emitting the per-finding deduplication queries in the importer perf +tests. NOT for production — remove before merge. +""" +import os +import sys +import traceback + +from django.db.backends.signals import connection_created +from django.dispatch import receiver + +_FILTER = os.environ.get("DD_SQL_STACKTRACE_FILTER", "pro_enhanced_finding") +_counter = {"n": 0} + + +def _stacktrace_wrapper(execute, sql, params, many, context): + if _FILTER in sql: + _counter["n"] += 1 + stack = "".join(traceback.format_stack()[:-1]) + sys.stderr.write( + f"\n===== DD_SQL_STACKTRACE hit #{_counter['n']} (filter={_FILTER!r}) =====\n" + f"SQL: {sql[:200]}\n" + f"{stack}" + f"===== end DD_SQL_STACKTRACE hit #{_counter['n']} =====\n", + ) + sys.stderr.flush() + return execute(sql, params, many, context) + + +@receiver(connection_created) +def _install_wrapper(sender, connection, **kwargs): + # execute_wrappers persist for the life of the connection; guard against + # double-registration if the signal fires more than once for a connection. + if getattr(connection, "_dd_sql_stacktrace_installed", False): + return + connection._dd_sql_stacktrace_installed = True + connection.execute_wrappers.append(_stacktrace_wrapper) + + +def maybe_enable(): + """Touch the module so the connection_created receiver is connected. + + Returns True when the DD_SQL_STACKTRACE env var requested activation. + """ + return bool(os.environ.get("DD_SQL_STACKTRACE")) diff --git a/dojo/templates/dojo/view_user.html b/dojo/templates/dojo/view_user.html index 47542577c0d..acc31927a48 100644 --- a/dojo/templates/dojo/view_user.html +++ b/dojo/templates/dojo/view_user.html @@ -281,7 +281,7 @@

{% trans "Import execution mode" %} - {{ user.usercontactinfo.get_import_execution_mode_display|default:_("Async (do not wait)") }} + {{ user.usercontactinfo.get_deduplication_execution_mode_display|default:_("Async (do not wait)") }} diff --git a/dojo/templates_classic/dojo/view_user.html b/dojo/templates_classic/dojo/view_user.html index 4d6ac25220f..238694e1c5f 100644 --- a/dojo/templates_classic/dojo/view_user.html +++ b/dojo/templates_classic/dojo/view_user.html @@ -281,7 +281,7 @@

{% trans "Import execution mode" %} - {{ user.usercontactinfo.get_import_execution_mode_display|default:_("Async (do not wait)") }} + {{ user.usercontactinfo.get_deduplication_execution_mode_display|default:_("Async (do not wait)") }} diff --git a/dojo/user/models.py b/dojo/user/models.py index 02e9f7fbbed..780d5c0b4ab 100644 --- a/dojo/user/models.py +++ b/dojo/user/models.py @@ -13,18 +13,18 @@ # request waits for the deduplication batches to finish before responding, so # notifications and the returned statistics reflect the deduplicated state. # - SYNC: post-processing runs inline in the web process (legacy block_execution). -IMPORT_EXECUTION_MODE_ASYNC = "async" -IMPORT_EXECUTION_MODE_ASYNC_WAIT = "async_wait" -IMPORT_EXECUTION_MODE_SYNC = "sync" -IMPORT_EXECUTION_MODES = ( - IMPORT_EXECUTION_MODE_ASYNC, - IMPORT_EXECUTION_MODE_ASYNC_WAIT, - IMPORT_EXECUTION_MODE_SYNC, +DEDUPLICATION_EXECUTION_MODE_ASYNC = "async" +DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT = "async_wait" +DEDUPLICATION_EXECUTION_MODE_SYNC = "sync" +DEDUPLICATION_EXECUTION_MODES = ( + DEDUPLICATION_EXECUTION_MODE_ASYNC, + DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, + DEDUPLICATION_EXECUTION_MODE_SYNC, ) -IMPORT_EXECUTION_MODE_CHOICES = ( - (IMPORT_EXECUTION_MODE_ASYNC, _("Async (do not wait)")), - (IMPORT_EXECUTION_MODE_ASYNC_WAIT, _("Async, wait for deduplication")), - (IMPORT_EXECUTION_MODE_SYNC, _("Synchronous (block)")), +DEDUPLICATION_EXECUTION_MODE_CHOICES = ( + (DEDUPLICATION_EXECUTION_MODE_ASYNC, _("Async (do not wait)")), + (DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, _("Async, wait for deduplication")), + (DEDUPLICATION_EXECUTION_MODE_SYNC, _("Synchronous (block)")), ) @@ -45,22 +45,22 @@ def wants_block_execution(user): # this returns False if there is no user, i.e. in celery processes, unittests, etc. # The synchronous import execution mode is the successor of the old block_execution # flag and governs whether async tasks run in the foreground for this user. - return hasattr(user, "usercontactinfo") and user.usercontactinfo.import_execution_mode == IMPORT_EXECUTION_MODE_SYNC + return hasattr(user, "usercontactinfo") and user.usercontactinfo.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_SYNC @staticmethod - def resolve_import_execution_mode(user, override=None): + def resolve_deduplication_execution_mode(user, override=None): """ Resolve the effective import post-processing execution mode. Priority: explicit request override > user profile setting > default async. - Returns one of IMPORT_EXECUTION_MODE_ASYNC / _ASYNC_WAIT / _SYNC. + Returns one of DEDUPLICATION_EXECUTION_MODE_ASYNC / _ASYNC_WAIT / _SYNC. """ - if override in IMPORT_EXECUTION_MODES: + if override in DEDUPLICATION_EXECUTION_MODES: return override info = getattr(user, "usercontactinfo", None) - if info is not None and info.import_execution_mode in IMPORT_EXECUTION_MODES: - return info.import_execution_mode - return IMPORT_EXECUTION_MODE_ASYNC + if info is not None and info.deduplication_execution_mode in DEDUPLICATION_EXECUTION_MODES: + return info.deduplication_execution_mode + return DEDUPLICATION_EXECUTION_MODE_ASYNC @staticmethod def force_password_reset(user): @@ -101,9 +101,9 @@ class UserContactInfo(models.Model): github_username = models.CharField(blank=True, null=True, max_length=150) slack_username = models.CharField(blank=True, null=True, max_length=150, help_text=_("Email address associated with your slack account"), verbose_name=_("Slack Email Address")) slack_user_id = models.CharField(blank=True, null=True, max_length=25) - import_execution_mode = models.CharField( + deduplication_execution_mode = models.CharField( max_length=20, - choices=IMPORT_EXECUTION_MODE_CHOICES, + choices=DEDUPLICATION_EXECUTION_MODE_CHOICES, null=True, blank=True, help_text=_( diff --git a/dojo/user/ui/forms.py b/dojo/user/ui/forms.py index 4024a4e51b7..751eec14b13 100644 --- a/dojo/user/ui/forms.py +++ b/dojo/user/ui/forms.py @@ -80,7 +80,7 @@ class Meta: # Swap order: password_last_reset before token_last_reset field_order = [ "title", "phone_number", "cell_number", "twitter_username", "github_username", - "slack_username", "ui_use_tailwind", "import_execution_mode", "force_password_reset", "reset_api_token", + "slack_username", "ui_use_tailwind", "deduplication_execution_mode", "force_password_reset", "reset_api_token", "password_last_reset", "token_last_reset", ] diff --git a/fix-jira-push-eligibility.md b/fix-jira-push-eligibility.md new file mode 100644 index 00000000000..d9ba4dee1fb --- /dev/null +++ b/fix-jira-push-eligibility.md @@ -0,0 +1,111 @@ +# Fix: JIRA push eligibility check blocks unrelated PATCH requests + +## Bug + +PATCH requests on findings (e.g. EPSS enrichment) fail with HTTP 400: + +``` +"Finding: X cannot be pushed to JIRA: Findings must be active and verified, + if enforced by system settings, to be pushed to JIRA." +``` + +The EPSS update is rolled back due to `ATOMIC_REQUESTS`. + +## Root cause + +Two PRs introduced between 2.52.3 and 2.55.4 combined to cause this: + +### PR #14262 — "Jira keep findings in sync" (2.55.2) + +`FindingSerializer.update()` in `dojo/api_v2/serializers.py` was changed to +trigger a JIRA push even when `push_to_jira` was not explicitly set: + +```python +# before +if push_to_jira: + jira_helper.push_to_jira(instance) + +# after +if push_to_jira or finding_helper.is_keep_in_sync_with_jira(instance): + jira_helper.push_to_jira(instance, sync=True) +``` + +This means any PATCH — including EPSS enrichment — triggers a JIRA push when: +- `jira_project.push_all_issues` is True (resolved in `perform_update` before + calling `serializer.save`), OR +- `finding_jira_sync` is enabled on the JIRA instance and the finding (or its + group) already has a JIRA issue + +### PR #14320 — "Fix Jira error handling" (2.55.3) + +`push_to_jira` return value (previously ignored) is now checked, and any +failure raises a `ValidationError`: + +```python +# after +success, message = jira_helper.push_to_jira(instance, sync=True) +if not success: + raise serializers.ValidationError(message) +``` + +`add_jira_issue` in `helper.py` already classifies "not active/verified" and +"below threshold" as *expected* eligibility failures and logs them at INFO +level — but it still returns `(False, message)`, which the serializer now +unconditionally raises as a 400. + +## Existing pattern + +`dojo/finding/views.py` (bulk edit, lines ~2963-3024) already does this +correctly: it calls `can_be_pushed_to_jira` as a guard before attempting the +push. If ineligible, it logs and skips — no exception is raised. + +## Fix + +### 1. `dojo/api_v2/views.py` — `FindingViewSet.perform_update()` + +Capture whether the push was *explicitly* requested before merging with +`push_all_issues`: + +```python +push_to_jira = serializer.validated_data.get("push_to_jira") +push_to_jira_explicit = bool(push_to_jira) # explicit before OR +jira_project = jira_helper.get_jira_project(serializer.instance) +if get_system_setting("enable_jira") and jira_project: + push_to_jira = push_to_jira or jira_project.push_all_issues +serializer.save(push_to_jira=push_to_jira, push_to_jira_explicit=push_to_jira_explicit) +``` + +### 2. `dojo/api_v2/serializers.py` — `FindingSerializer.update()` + +Guard with `can_be_pushed_to_jira` (same pattern as bulk edit view). +Only raise `ValidationError` for eligibility failures when the push was +explicitly requested by the caller; auto-triggered pushes skip silently: + +```python +push_to_jira = validated_data.pop("push_to_jira") +push_to_jira_explicit = validated_data.pop("push_to_jira_explicit", False) + +... + +if push_to_jira or finding_helper.is_keep_in_sync_with_jira(instance): + can_push, error_message, _error_code = jira_helper.can_be_pushed_to_jira(instance) + if can_push: + # Push synchronously so that we can see jira errors in real time + success, message = jira_helper.push_to_jira(instance, sync=True) + if not success: + raise serializers.ValidationError(message) + elif push_to_jira_explicit: + # User explicitly asked to push — surface the eligibility error + raise serializers.ValidationError(error_message) + # else: auto-triggered (push_all_issues / finding_jira_sync) but finding + # is not eligible → silently skip, same as bulk edit view +``` + +## Behaviour after fix + +| Trigger | Finding eligible | Result | +|---|---|---| +| Explicit `push_to_jira=true` | yes | push, raise on push failure | +| Explicit `push_to_jira=true` | no | raise 400 (expected, user asked) | +| Auto (`push_all_issues` / sync) | yes | push, raise on push failure | +| Auto (`push_all_issues` / sync) | no | silently skip, PATCH succeeds | diff --git a/phase10-session-prompt.md b/phase10-session-prompt.md new file mode 100644 index 00000000000..e98128429d8 --- /dev/null +++ b/phase10-session-prompt.md @@ -0,0 +1,69 @@ +Implement "Phase 10: Peripheral Model Modules — 10-PR Stack Continuation" from AGENTS.md. +Read that section first — it is the complete source of truth (bundles, line ranges, +stack/cascade mechanics, gotchas). Also read the Phase 0–9 playbook above it and follow +CLAUDE.local.md (run-unittest.sh + tee, PR body rules, never commit CLAUDE.md/.claude/). + +## Goal +Finish the reorg by moving the ~45 peripheral model classes still in dojo/models.py into +their domain modules, as a full vertical slice (Phases 1–9) per module. Continue the existing +stack on top of reorg/finding-models (#14974). + +## Order (bottom-up, do not parallelize across branches — they all touch the same monoliths) +1. FIRST: fold CWE + BurpRawRequestResponse into dojo/finding/ on the EXISTING branch + reorg/finding-models (#14974). No new PR. Commit, force-push --force-with-lease to upstream. +2. Then PRs 6–10, each branched from its predecessor's tip: + - reorg/peripheral-user (Bundle A) ← reorg/finding-models + - reorg/peripheral-tools-endpoint (Bundle B) ← peripheral-user + - reorg/peripheral-survey-benchmark(Bundle C) ← peripheral-tools-endpoint + - reorg/peripheral-notes-files (Bundle D) ← peripheral-survey-benchmark + - reorg/peripheral-misc (Bundle E) ← peripheral-notes-files + Each: push to UPSTREAM, open as a DRAFT PR (gh pr create --draft --repo + DefectDojo/django-DefectDojo --base --head ). + +## Subagent strategy (cost + context, NOT parallelism) +All modules edit the same monolith files (dojo/models.py, forms.py, filters.py, +api_v2/serializers.py, api_v2/views.py, urls.py, apps.py). NEVER run two module subagents +concurrently — they would clobber each other's edits to those shared files. Process modules +ONE AT A TIME, bottom-up. + +Use Sonnet subagents to offload mechanical bulk work and preserve your (Opus) context: +- You run Phase 0 pre-flight per module (grep consumers — don't trust memorized lists). +- Spawn ONE Sonnet subagent (model: sonnet) per module, sequentially. It does: move models+admin, + add re-export stubs, string-ref cross-bundle FKs, move forms→ui/forms.py, filters→ui/filters.py, + views/urls→ui/, and the API layer if one exists. Give it exact line ranges from AGENTS.md + the + matching template (dojo/finding/, dojo/product/, dojo/test/, dojo/engagement/, or dojo/url// + dojo/location/ for models-only). +- YOU handle the judgment parts: circular-import resolution, serializer get_fields() lazy + cycle-breaks + extend_schema_field set_override, prefetcher full-reexport, shared-base + decisions (FindingTagStringFilter trap). +- After each module: YOU run verify gates (docker manage.py check, makemigrations --check, + ruff check, ./run-unittest.sh --test-case unittests. 2>&1 | tee /tmp/test_.log, + grep the log). A Sonnet agent does not get to declare success — you confirm via gates before + moving on. + +## Per-bundle gotchas (also in AGENTS.md) +- survey & benchmark: NO api_v2 serializers/viewsets → skip Phases 6–9 for them (confirm w/ Phase 0). +- Question/Answer base classes live inside a `with warnings.catch_warnings():` block — preserve it. +- Benchmark_Requirement → M2M CWE: use string ref "dojo.CWE" (CWE moves to finding lower in stack). +- tool_config: move ToolConfigForm_Admin (ModelForm) + Tool_Configuration_Admin (ModelAdmin) + from models.py into dojo/tool_config/admin.py. +- user (Bundle A): Dojo_User is an FK target everywhere — land it first; string-ref consumers. + +## Commit/PR conventions +- One commit per phase group, same style as existing stack (e.g. + "refactor(): extract API layer into dojo//api/ [ Phase 6,7,8,9]"). +- Every PR body (all 10) must carry the 10-item stack map already on #14970–#14974; copy that + block, set the "◀ this PR" marker. Summary section only — NO test-plan section. Read existing + body first (gh pr view --json body -q '.body'); edit via --body-file or + gh api -X PATCH /repos/DefectDojo/django-DefectDojo/pulls/ --input payload.json; verify after. +- After the 5 new draft PRs exist, BACKFILL their real PR numbers into all 10 PR bodies + (replace the "_draft, to be opened_" placeholders for items 6–10). + +## Stack hygiene +- If you edit a lower branch after upper ones exist, cascade: + git rebase --onto up the chain, then force-push all + with --force-with-lease to upstream. +- Use --no-track when creating branches. + +Work bottom-up, ONE module at a time. Stop and report if any verify gate fails and the fix +isn't mechanical. Narrate each module's pass/fail per CLAUDE.local.md. diff --git a/scripts/run-nuclei-docker.sh b/scripts/run-nuclei-docker.sh new file mode 100644 index 00000000000..6a430e8ef9c --- /dev/null +++ b/scripts/run-nuclei-docker.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Run ProjectDiscovery Nuclei in Docker with a persistent templates volume and rate limiting. +# Usage: ./scripts/run-nuclei-docker.sh [nuclei-extra-args...] +# Env: RATE_LIMIT, TEMPLATES_DIR, NUCLEI_IMAGE, UPDATE_TEMPLATES + +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [nuclei-extra-args...]" >&2 + exit 1 +fi + +TARGET="$1" +shift + +RATE_LIMIT="${RATE_LIMIT:-150}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TEMPLATES_DIR="${TEMPLATES_DIR:-${SCRIPT_DIR}/.nuclei-templates}" +IMAGE="${NUCLEI_IMAGE:-projectdiscovery/nuclei:latest}" +# Set to "true" to refresh templates before the scan (slower; use periodically). +UPDATE_TEMPLATES="${UPDATE_TEMPLATES:-false}" + +mkdir -p "${TEMPLATES_DIR}" + +DOCKER_RUN=( + docker run --rm + --pull always + -v "${TEMPLATES_DIR}:/root/nuclei-templates" + "${IMAGE}" + -ud /root/nuclei-templates +) + +if [[ "${UPDATE_TEMPLATES}" == "true" ]]; then + echo "Updating Nuclei templates in ${TEMPLATES_DIR} ..." + "${DOCKER_RUN[@]}" -update-templates + echo "" +fi + +echo "Scanning ${TARGET} (rate limit: ${RATE_LIMIT} req/s) ..." +"${DOCKER_RUN[@]}" -u "${TARGET}" -rate-limit "${RATE_LIMIT}" "$@" diff --git a/tagoulus-replcement-options.md b/tagoulus-replcement-options.md new file mode 100644 index 00000000000..795cb77aea1 --- /dev/null +++ b/tagoulus-replcement-options.md @@ -0,0 +1,319 @@ +# Tagulous Replacement Options + +## Context + +DefectDojo uses [django-tagulous](https://github.com/radiac/django-tagulous) (forked locally at `/home/valentijn/django-tagulous/`) for tagging on 9 models. Library is unmaintained upstream. Local fork carries 1 commit (Django 5.2 deserializer monkeypatch). Performance is the primary pain point — signal-driven count maintenance and per-row M2M sync on every save. + +## Current State + +### Tagulous usage in DefectDojo + +- **9 TagFields**, all flat, all `force_lowercase=True`. No trees. No `SingleTagField`. No `max_count`. +- Models: `Product`, `Engagement`, `Test`, `Finding`, `Endpoint`, `FindingTemplate`, `AppAnalysis`, `Objects_Product`, `Location`. +- **`tag.count`** unused outside tests. Pure overhead. +- **`_inherited_tag_names` JSONField** already replaced 5 inherited M2Ms in migration `0265`. Pattern proven internally. +- **`dojo/tag_utils.py`** (526 LOC) bypasses tagulous internals for bulk ops. Depends on `tag_options`, `related_model`, `remote_field.through`. +- **9 tag tables + 9 through-tables + 5 JSON columns** active. + +### Tagulous internals (perf hotspots) + +- **Signals fire on every write**: `pre_save`, `post_save`, `pre_delete`, `post_delete`. `post_save` always calls `reload()` + `add()`/`remove()` even when tags unchanged. +- **`update_count()`**: full requery per tag on raw deserialization. +- **Increment/decrement** uses F-expressions per tag, then refreshes. +- **No setting** to disable count maintenance. +- **Library size**: ~4500 LOC total, ~1800 LOC core. Vendorable. + +### Features used vs unused + +| Feature | Used? | +|---|---| +| Flat M2M tags | Yes | +| `force_lowercase` | Yes | +| Custom DRF parsing (comma/quoted) | Yes | +| Autocomplete widget | Yes (admin + UI) | +| Tag trees (`tree=True`) | No | +| `SingleTagField` | No | +| `max_count` | No | +| `protect_initial` / `protect_all` | No | +| `tag.count` (public) | No | +| Merge/rename admin | Minimal | + +## Options Ranked + +### Option A — JSONField only (drop M2M entirely) — best long-term + +Store `tags = JSONField(default=list)` per object. Index via GIN (Postgres) or functional index. Lookups via `tags__contains=[name]`. + +**Pros** +- No through-tables. No tag table. No signals. No count drift. +- Insert/update = single column write. Bulk = `bulk_update`. +- Matches doc-store model. Inheritance is already JSON. +- GIN-backed lookup fast on Postgres. +- Eliminates 9 tag tables + 9 through-tables. Major schema simplification. + +**Cons** +- No central tag registry. Autocomplete needs `DISTINCT jsonb_array_elements` or small `Tag(name)` registry refreshed async. +- Rename tag = `UPDATE WHERE tags @> ['old']` per affected table. Heavier than tagulous merge. +- MySQL/MariaDB JSON perf weaker than Postgres. Check support matrix (DefectDojo supports both). +- Large data migration: 9 fields × all rows. Batched backfill required. +- Serializer/form/widget rewrite. Lose autocomplete widget unless rebuilt. +- Breaks consumers assuming M2M shape (admin filters, pghistory proxies, external plugins). + +### Option B — Vendor tagulous, gut it — best short-term + +Copy `tagulous/` into `dojo/tagulous/`. Strip: +- `count` field + all `increment`/`decrement`/`try_delete`/`update_count` +- tree code (`TagTreeModel`, ~150 LOC) +- `pre_delete`/`post_delete` for `SingleTagField` (unused) +- raw=True `update_count` loop (signal hotspot) +- `protect_initial`/`protect_all` paths (unused) + +Keep: parser, manager, descriptor, autocomplete widget, migration ops. + +**Pros** +- ~50% LOC reduction. Owned. No upstream coupling. +- Removes biggest perf hits (count maintenance, raw deserialization recount). +- Migrations stay valid (same model shape). Zero data migration. +- `dojo/tag_utils.py` keeps working unchanged. +- Unblocks Django 5.2+ permanently. + +**Cons** +- Still M2M-shaped. Per-write signal overhead still exists, just lighter. +- ~2000 LOC owned by team. +- Doesn't fix root cause (signal-driven sync). Trims fat only. + +### Option C — Hybrid: JSON denorm + keep M2M + +Add `tag_names JSONField` alongside existing M2M. Reads/serializers/filters use JSON. M2M kept for admin/legacy. + +**Pros**: incremental. Low risk per step. Can drop M2M later. + +**Cons**: data duplication on writes. Two sources of truth = sync bugs. Worst-of-both during transition. + +### Option D — Keep fork as-is, add knobs + +Add to fork: `count_disabled=True` option, batch signal mode, skip-redundant-add path. + +**Pros**: minimal change. + +**Cons**: still upstream-shaped library no one else maintains. `tag_utils.py` still depends on internals. Doesn't solve root issue. + +### Option E — Build new lib ("if we built it today") + +Tiny library `django-jsontags`: +- `JSONTagField` = JSONField subclass +- Validators (lowercase, charset, max length) +- Manager-like `.add()/.remove()/.set()` via descriptor +- Lookup helpers wrapping `__contains` for cross-DB compat +- Optional async-refreshed `Tag(name, last_seen)` registry for autocomplete +- Postgres GIN auto-index hint +- DRF field + Form widget (Select2 fed by registry) +- ~500 LOC. No signals. No counts. No trees. + +Open-source as replacement for tagulous. Solves real ecosystem pain. + +## Recommendation + +**Two-step plan**: + +### Step 1 (now, 1–2 sprints): Option B + +Vendor tagulous into `dojo/tagulous/`, strip count + tree + raw-recount + unused protect paths. Removes signal hotspots without data migration. Unblocks Django 5.2+. Keeps `tag_utils.py` working. Low risk. + +Deliverables: +- `dojo/tagulous/` with stripped code +- Pin replaces `tagulous` import path +- Drop `count` column via single migration +- Benchmark: import workflow + bulk-tag ops before/after + +### Step 2 (next quarter): Option A + +Migrate to JSONField, model by model, starting with **Endpoint** (highest write volume from imports) then **Finding** (largest table). Per model: + +1. Add `tag_names JSONField` + GIN index (Postgres) / functional index (MySQL) +2. Backfill in batches via management command +3. Switch reads to JSON (services, serializers, filters, search) +4. Switch writes to JSON via service layer (`tag_utils.py` becomes thin wrapper) +5. Validate parity for one release +6. Drop M2M + tag table + +### Step 3 (optional, after dogfooding): Option E + +Extract `django-jsontags` library from internal code. Free byproduct of step 2. Open-source as tagulous successor. + +## Key Risks + +- **DB matrix**: confirm Postgres + MySQL JSON ops. DefectDojo supports both. Test on MariaDB minimum supported version. +- **Autocomplete UX**: registry table refresh strategy must be designed before any model flips. +- **Audit log (pghistory)**: tags currently tracked via through-table proxy models. JSON column tracks differently. Verify history continuity. +- **External integrations**: any plugin/parser/API consumer assuming M2M shape breaks. Grep `dojo_pro` + external projects. +- **Search ranking**: full-text/Watson over tag names — confirm GIN-backed `@>` matches current ranking behavior. +- **Admin filters**: Django admin `list_filter` on tags loses M2M widget. Replace or drop. +- **Bulk-import perf regression**: validate `tag_utils.py` paths still beat naive M2M during step 1, before step 2 begins. + +## Open Questions + +- Drop MariaDB/MySQL support tier for tag-heavy queries, or build cross-DB lookup helper? +- Move tag registry refresh to Celery task or DB trigger? +- Keep merge/rename admin UI or accept SQL-level rename procedure? +- Include `Tag.description`/`Tag.color` metadata fields in registry, or keep tags pure strings? + +--- + +## Addendum — Design notes from follow-up review + +### `raw=True` on `post_save` + +Django passes `raw=True` when the save originates from `loaddata` (fixture/dump deserialization). Tagulous treats this as "data injected directly into DB, internal counters stale" and calls `tag.update_count()` per tag — full `COUNT(*)` requery for every tag on every fixture-loaded row. Catastrophic on large dumps. Stripping this branch is one of the cheap wins in Option B. + +### Field-change detection (Option A signal) + +Codebase already uses `from fieldsignals import pre_save_changed` (see [dojo/finding/helper.py:16](dojo/finding/helper.py#L16)). No need for `django-model-utils` `FieldTracker` or self-rolled mixin. + +Hook shape: + +```python +from fieldsignals import pre_save_changed + +@receiver(pre_save_changed, sender=Finding, fields=['tags']) +def on_tags_changed(sender, instance, changed_fields, **kwargs): + field = Finding._meta.get_field('tags') + old, new = changed_fields[field] + added = set(new or []) - set(old or []) + removed = set(old or []) - set(new or []) + ... +``` + +### Batch context manager + +Mirror existing `dojo/tag_inheritance.py:batch()` pattern. Thread-local flag suppresses dispatch, accumulates diff, flushes on exit: + +```python +_batch_state = threading.local() + +@contextmanager +def batch_tag_writes(): + _batch_state.active = True + _batch_state.added = set() + _batch_state.removed = set() + try: + yield + finally: + added, removed = _batch_state.added, _batch_state.removed + _batch_state.active = False + if added or removed: + transaction.on_commit(lambda: registry_upsert.delay(list(added), list(removed))) +``` + +Importer wraps bulk loop in `with batch_tag_writes():`. Single dispatch instead of N. + +### Edge cases for any signal-driven path + +- **`bulk_update`/`bulk_create` skip signals.** Bulk paths must call registry update explicitly. `dojo/tag_utils.py` is the right seam — already centralizes bulk. +- **Queryset `.update()` skips signals too.** Same fix. +- **Transaction rollback**: always use `transaction.on_commit(...)` so dispatched task only fires on committed write. Critical to avoid registry pollution from rolled-back imports. +- **Race vs nightly GC**: GC must check current state at run-time, not trust `last_seen` alone. Otherwise concurrent writes lose entries. + +--- + +## Tag Registry — three implementation flavors + +The "registry" = table backing autocomplete and (optionally) per-tag counts. Three valid shapes, ranked. + +### Reg-1 — Maintained table + signals + +`Tag(name PK, last_seen)` updated by `pre_save_changed` signal. Nightly GC removes orphans. + +- **Pros**: cheap read (`name ILIKE 'crit%' LIMIT 20`), realtime visibility of new tags. +- **Cons**: drift under bulk paths, signal plumbing, batch hooks, `on_commit`, GC job. Same maintenance disease as `tag.count` today, in miniature. + +### Reg-2 — On-demand DISTINCT (no registry) + +```sql +SELECT DISTINCT jsonb_array_elements_text(tags) AS name +FROM finding +ORDER BY name LIMIT 20; +``` + +UNION across tagged tables for global autocomplete. + +- **Pros**: zero maintenance. Always current. No signals. +- **Cons**: GIN index does **not** accelerate `DISTINCT jsonb_array_elements_text` — it accelerates containment, not distinct extraction. Needs expression index or `pg_trgm` on extracted values. Slow on Finding/Endpoint without help. +- **Verdict**: viable for small tables only (Product, Engagement). Not for hot tables. + +### Reg-3 — Materialized view ★ recommended + +```sql +CREATE MATERIALIZED VIEW dojo_tag_names AS +SELECT DISTINCT name FROM ( + SELECT jsonb_array_elements_text(tags) AS name FROM finding + UNION + SELECT jsonb_array_elements_text(tags) FROM product + UNION ... +) t; +CREATE UNIQUE INDEX ON dojo_tag_names(name); +CREATE INDEX ON dojo_tag_names USING gin(name gin_trgm_ops); +``` + +Refresh `CONCURRENTLY` on cron (every 5 min). Optional write-path fast-path: `INSERT ... ON CONFLICT DO NOTHING` on tag-create so new tags autocomplete immediately. View still rebuilds nightly to GC removed names. + +- **Pros**: zero signal coupling. Fast autocomplete (trigram). Converges. Single bg job. No per-write maintenance. +- **Cons**: stale window for *removed* tags (acceptable — autocomplete tolerates ghosts). Postgres-only. +- **MariaDB equivalent**: regular table + scheduled `INSERT ... ON DUPLICATE KEY UPDATE` from `SELECT DISTINCT JSON_EXTRACT(...)`. Same shape, manual refresh. + +### Recommendation + +**Reg-3** with optional insert-only fast-path on tag create. Reg-1 only wins if realtime *removal* visible in autocomplete is required (it isn't). Reg-2 only wins on tiny tables. + +--- + +## Counts later (if/when needed) — three flavors + +Same lesson as registry: don't bring M2M back. Counts fit in registry table or a derived view. + +### Count-1 — Incremental column on registry + +`ref_count INTEGER` updated by signal diff. Atomic `F('ref_count') ± 1`. + +- **Pros**: O(diff) per write. Realtime. +- **Cons**: drifts under bulk + rollback. Same maintenance disease as today's `tag.count`. + +### Count-2 — On-demand recount + +```sql +SELECT COUNT(*) FROM finding WHERE tags @> '["critical"]'::jsonb; +``` + +GIN index makes containment fast even on 1M rows. + +- **Pros**: always correct. No coupling. +- **Cons**: O(matching rows) per query unless cached. + +### Count-3 — Materialized view ★ recommended if counts ever needed + +```sql +CREATE MATERIALIZED VIEW dojo_tag_counts AS +SELECT 'finding' AS scope, tag, COUNT(*) AS n +FROM finding, jsonb_array_elements_text(tags) AS tag +GROUP BY tag +UNION ALL ...; +``` + +Refresh nightly or on demand. + +- **Pros**: correct, fast read, no coupling, multi-scope in one query. +- **Cons**: stale until refresh. Postgres CONCURRENTLY needs unique index. MariaDB needs equivalent table + cron. + +### Recommendation + +Start with **no count column at all**. `tag.count` was unused for years. If a real consumer appears (tag cloud, reports), add **Count-3**. Skip Count-1 — its maintenance cost is exactly what Option A is escaping. + +--- + +## Updated decision summary + +- **Field-change detection**: `fieldsignals.pre_save_changed`. Already in tree. +- **Registry**: Reg-3 (materialized view + optional insert-only fast-path). +- **Counts**: skip until proven need. Then Count-3. +- **Batch path**: thread-local context manager + `transaction.on_commit` + explicit hooks in `tag_utils.py` bulk methods. +- **M2M not needed for any of the above.** diff --git a/unittests/test_async_delete.py b/unittests/test_async_delete.py index ae0105791e7..b7c2b5413a4 100644 --- a/unittests/test_async_delete.py +++ b/unittests/test_async_delete.py @@ -27,21 +27,21 @@ class TestAsyncDelete(DojoTestCase): """ Test async_delete functionality with dojo_dispatch_task kwargs injection. - These tests use import_execution_mode="sync" and crum.impersonate to run tasks synchronously, + These tests use deduplication_execution_mode="sync" and crum.impersonate to run tasks synchronously, which allows errors to surface immediately rather than being lost in background workers. """ def setUp(self): - """Set up test user with import_execution_mode="sync" and disable unneeded features.""" + """Set up test user with deduplication_execution_mode="sync" and disable unneeded features.""" super().setUp() - # Create test user with import_execution_mode="sync" to run tasks synchronously + # Create test user with deduplication_execution_mode="sync" to run tasks synchronously self.testuser = User.objects.create( username="test_async_delete_user", is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, import_execution_mode="sync") + UserContactInfo.objects.create(user=self.testuser, deduplication_execution_mode="sync") # Log in as the test user (for API client) self.client.force_login(self.testuser) diff --git a/unittests/test_cascade_delete.py b/unittests/test_cascade_delete.py index 8f8a4244610..ab86c942b43 100644 --- a/unittests/test_cascade_delete.py +++ b/unittests/test_cascade_delete.py @@ -37,7 +37,7 @@ def setUp(self): is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, import_execution_mode="sync") + UserContactInfo.objects.create(user=self.testuser, deduplication_execution_mode="sync") self.system_settings(enable_deduplication=False) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_celery_dispatch_force_async.py b/unittests/test_celery_dispatch_force_async.py index 7cf523d78d2..d0479a02d7b 100644 --- a/unittests/test_celery_dispatch_force_async.py +++ b/unittests/test_celery_dispatch_force_async.py @@ -3,7 +3,7 @@ `force_async=True` is for callers (e.g. the watson async indexer middleware) that should always run their celery task in the background even when the -current user has `import_execution_mode="sync"` or the caller passes `force_sync=True`. +current user has `deduplication_execution_mode="sync"` or the caller passes `force_sync=True`. """ from unittest.mock import patch diff --git a/unittests/test_deduplication_logic.py b/unittests/test_deduplication_logic.py index e1f5e08d05e..1c8ed8773ab 100644 --- a/unittests/test_deduplication_logic.py +++ b/unittests/test_deduplication_logic.py @@ -162,7 +162,7 @@ class TestDuplicationLogic(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_duplication_loops.py b/unittests/test_duplication_loops.py index fabeb376de0..4c4616dafc4 100644 --- a/unittests/test_duplication_loops.py +++ b/unittests/test_duplication_loops.py @@ -19,7 +19,7 @@ class TestDuplicationLoops(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_false_positive_history_logic.py b/unittests/test_false_positive_history_logic.py index bc8ebb56641..9e4b7840fb4 100644 --- a/unittests/test_false_positive_history_logic.py +++ b/unittests/test_false_positive_history_logic.py @@ -131,7 +131,7 @@ class TestFalsePositiveHistoryLogic(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.save() # Unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_finding_helper.py b/unittests/test_finding_helper.py index 2e50424719b..d9fb2166d4b 100644 --- a/unittests/test_finding_helper.py +++ b/unittests/test_finding_helper.py @@ -256,7 +256,7 @@ def setUp(self): super().setUp() self.system_settings(enable_jira=True) self.testuser = User.objects.get(username="admin") - self.testuser.usercontactinfo.import_execution_mode = "sync" + self.testuser.usercontactinfo.deduplication_execution_mode = "sync" self.testuser.usercontactinfo.save() token = Token.objects.get(user=self.testuser) self.client = APIClient() diff --git a/unittests/test_finding_model.py b/unittests/test_finding_model.py index 4d2cb4b2761..0e210f6159f 100644 --- a/unittests/test_finding_model.py +++ b/unittests/test_finding_model.py @@ -558,7 +558,7 @@ class TestFindingSLAExpiration(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_import_execution_mode.py b/unittests/test_import_execution_mode.py index a9232907df5..8c2bd4080ad 100644 --- a/unittests/test_import_execution_mode.py +++ b/unittests/test_import_execution_mode.py @@ -4,9 +4,9 @@ from dojo.importers.default_importer import DefaultImporter from dojo.models import ( - IMPORT_EXECUTION_MODE_ASYNC, - IMPORT_EXECUTION_MODE_ASYNC_WAIT, - IMPORT_EXECUTION_MODE_SYNC, + DEDUPLICATION_EXECUTION_MODE_ASYNC, + DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, + DEDUPLICATION_EXECUTION_MODE_SYNC, Development_Environment, Dojo_User, Engagement, @@ -20,7 +20,7 @@ class ImportExecutionModeResolverTest(DojoTestCase): - """resolve_import_execution_mode: request override > profile > default.""" + """resolve_deduplication_execution_mode: request override > profile > default.""" fixtures = ["dojo_testdata.json"] @@ -31,38 +31,38 @@ def setUp(self): def _set_profile(self, *, mode=None): UserContactInfo.objects.update_or_create( user=self.user, - defaults={"import_execution_mode": mode}, + defaults={"deduplication_execution_mode": mode}, ) self.user.refresh_from_db() def test_default_is_async(self): - self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, Dojo_User.resolve_import_execution_mode(self.user)) + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_ASYNC, Dojo_User.resolve_deduplication_execution_mode(self.user)) def test_request_override_wins_over_profile(self): - self._set_profile(mode=IMPORT_EXECUTION_MODE_SYNC) + self._set_profile(mode=DEDUPLICATION_EXECUTION_MODE_SYNC) self.assertEqual( - IMPORT_EXECUTION_MODE_ASYNC_WAIT, - Dojo_User.resolve_import_execution_mode(self.user, IMPORT_EXECUTION_MODE_ASYNC_WAIT), + DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, + Dojo_User.resolve_deduplication_execution_mode(self.user, DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT), ) def test_profile_mode_used_when_no_override(self): - self._set_profile(mode=IMPORT_EXECUTION_MODE_ASYNC_WAIT) - self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC_WAIT, Dojo_User.resolve_import_execution_mode(self.user)) + self._set_profile(mode=DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT) + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, Dojo_User.resolve_deduplication_execution_mode(self.user)) def test_empty_profile_falls_back_to_async(self): self._set_profile(mode=None) - self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, Dojo_User.resolve_import_execution_mode(self.user)) + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_ASYNC, Dojo_User.resolve_deduplication_execution_mode(self.user)) def test_invalid_override_ignored(self): - self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, Dojo_User.resolve_import_execution_mode(self.user, "garbage")) + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_ASYNC, Dojo_User.resolve_deduplication_execution_mode(self.user, "garbage")) def test_no_user(self): - self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, Dojo_User.resolve_import_execution_mode(None)) + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_ASYNC, Dojo_User.resolve_deduplication_execution_mode(None)) def test_wants_block_execution_only_for_sync_mode(self): - self._set_profile(mode=IMPORT_EXECUTION_MODE_SYNC) + self._set_profile(mode=DEDUPLICATION_EXECUTION_MODE_SYNC) self.assertTrue(Dojo_User.wants_block_execution(self.user)) - self._set_profile(mode=IMPORT_EXECUTION_MODE_ASYNC_WAIT) + self._set_profile(mode=DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT) self.assertFalse(Dojo_User.wants_block_execution(self.user)) self._set_profile(mode=None) self.assertFalse(Dojo_User.wants_block_execution(self.user)) @@ -70,7 +70,7 @@ def test_wants_block_execution_only_for_sync_mode(self): class ImporterDispatchKwargsTest(DojoTestCase): - """import_execution_mode -> dojo_dispatch_task force flags.""" + """deduplication_execution_mode -> dojo_dispatch_task force flags.""" fixtures = ["dojo_testdata.json"] @@ -79,34 +79,34 @@ def _importer(self, mode, **extra): scan_type="ZAP Scan", engagement=Engagement.objects.first(), environment=Development_Environment.objects.first(), - import_execution_mode=mode, + deduplication_execution_mode=mode, **extra, ) def test_sync_mode_forces_sync(self): - self.assertEqual({"force_sync": True}, self._importer(IMPORT_EXECUTION_MODE_SYNC).post_processing_dispatch_kwargs()) + self.assertEqual({"force_sync": True}, self._importer(DEDUPLICATION_EXECUTION_MODE_SYNC).post_processing_dispatch_kwargs()) def test_async_wait_mode_forces_async(self): - self.assertEqual({"force_async": True}, self._importer(IMPORT_EXECUTION_MODE_ASYNC_WAIT).post_processing_dispatch_kwargs()) + self.assertEqual({"force_async": True}, self._importer(DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT).post_processing_dispatch_kwargs()) def test_async_mode_preserves_external_force_sync(self): - importer = self._importer(IMPORT_EXECUTION_MODE_ASYNC) + importer = self._importer(DEDUPLICATION_EXECUTION_MODE_ASYNC) self.assertEqual({"force_sync": False}, importer.post_processing_dispatch_kwargs()) self.assertEqual({"force_sync": True}, importer.post_processing_dispatch_kwargs(force_sync=True)) def test_invalid_mode_defaults_to_async(self): - self.assertEqual(IMPORT_EXECUTION_MODE_ASYNC, self._importer("nonsense").import_execution_mode) + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_ASYNC, self._importer("nonsense").deduplication_execution_mode) def test_external_force_sync_promotes_to_sync_mode(self): - importer = self._importer(IMPORT_EXECUTION_MODE_ASYNC, force_sync=True) - self.assertEqual(IMPORT_EXECUTION_MODE_SYNC, importer.import_execution_mode) + importer = self._importer(DEDUPLICATION_EXECUTION_MODE_ASYNC, force_sync=True) + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_SYNC, importer.deduplication_execution_mode) @override_settings(CELERY_TASK_ALWAYS_EAGER=True) class ImportExecutionModeAPITest(DojoAPITestCase): """ - End-to-end: the import endpoints accept and honor import_execution_mode. + End-to-end: the import endpoints accept and honor deduplication_execution_mode. CELERY_TASK_ALWAYS_EAGER runs dispatched tasks inline against the test DB, so 'async_wait' can actually join its deduplication batch (a real broker/worker @@ -124,12 +124,12 @@ def _payload(self, mode): "minimum_severity": "Low", "scan_type": "ZAP Scan", "engagement": 1, - "import_execution_mode": mode, + "deduplication_execution_mode": mode, } def test_import_async_wait_returns_statistics(self): with (get_unit_tests_path() / "scans/zap/0_zap_sample.xml").open(encoding="utf-8") as testfile: - payload = self._payload(IMPORT_EXECUTION_MODE_ASYNC_WAIT) + payload = self._payload(DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT) payload["file"] = testfile result = self.import_scan(payload, 201) self.assertIn("statistics", result) @@ -139,7 +139,7 @@ def test_import_async_wait_returns_statistics(self): def test_import_async_does_not_await_deduplication(self): with (get_unit_tests_path() / "scans/zap/0_zap_sample.xml").open(encoding="utf-8") as testfile: - payload = self._payload(IMPORT_EXECUTION_MODE_ASYNC) + payload = self._payload(DEDUPLICATION_EXECUTION_MODE_ASYNC) payload["file"] = testfile result = self.import_scan(payload, 201) self.assertFalse(result["deduplication_complete"]) diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 26c70f2a52e..486d4e4eef2 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -2323,7 +2323,7 @@ def __init__(self, *args, **kwargs): def setUp(self): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() # self.url = reverse(self.viewname + '-list') @@ -3122,7 +3122,7 @@ def __init__(self, *args, **kwargs): def setUp(self): # still using the API to verify results testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() # self.url = reverse(self.viewname + '-list') diff --git a/unittests/test_importers_deduplication.py b/unittests/test_importers_deduplication.py index 0bd2a71568c..f4c692c7101 100644 --- a/unittests/test_importers_deduplication.py +++ b/unittests/test_importers_deduplication.py @@ -37,7 +37,7 @@ def setUp(self): testuser.is_superuser = True testuser.is_staff = True testuser.save() - UserContactInfo.objects.create(user=testuser, import_execution_mode="sync") + UserContactInfo.objects.create(user=testuser, deduplication_execution_mode="sync") # Authenticate API client as admin for import endpoints self.login_as_admin() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 1d011c747bb..3fbe7cb84a4 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -67,7 +67,7 @@ def setUp(self): super().setUp() testuser, _ = User.objects.get_or_create(username="admin") - UserContactInfo.objects.update_or_create(user=testuser, defaults={"import_execution_mode": None}) + UserContactInfo.objects.update_or_create(user=testuser, defaults={"deduplication_execution_mode": None}) self.system_settings(enable_product_grade=False) self.system_settings(enable_github=False) @@ -363,7 +363,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self._import_reimport_performance( @@ -387,7 +387,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self.system_settings(enable_product_grade=True) @@ -541,7 +541,7 @@ def test_deduplication_performance_pghistory_no_async(self): self.system_settings(enable_deduplication=True) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self._deduplication_performance( @@ -653,7 +653,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self._import_reimport_performance( @@ -677,7 +677,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self.system_settings(enable_product_grade=True) @@ -805,7 +805,7 @@ def test_deduplication_performance_pghistory_no_async(self): self.system_settings(enable_deduplication=True) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self._deduplication_performance( diff --git a/unittests/test_jira_config_engagement.py b/unittests/test_jira_config_engagement.py index 7b8beb6e8ea..ca730fde0de 100644 --- a/unittests/test_jira_config_engagement.py +++ b/unittests/test_jira_config_engagement.py @@ -252,7 +252,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.user = self.get_test_admin() self.client.force_login(self.user) - self.user.usercontactinfo.import_execution_mode = "sync" + self.user.usercontactinfo.deduplication_execution_mode = "sync" self.user.usercontactinfo.save() # product 3 has no jira project config, double check to make sure someone didn't molest the fixture # running this in __init__ throws database access denied error diff --git a/unittests/test_jira_config_engagement_epic.py b/unittests/test_jira_config_engagement_epic.py index 6f314b9748c..a77d005839a 100644 --- a/unittests/test_jira_config_engagement_epic.py +++ b/unittests/test_jira_config_engagement_epic.py @@ -39,7 +39,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.user = self.get_test_admin() self.client.force_login(self.user) - self.user.usercontactinfo.import_execution_mode = "sync" + self.user.usercontactinfo.deduplication_execution_mode = "sync" self.user.usercontactinfo.save() # product 3 has no jira project config, double check to make sure someone didn't molest the fixture # running this in __init__ throws database access denied error diff --git a/unittests/test_jira_import_and_pushing_api.py b/unittests/test_jira_import_and_pushing_api.py index 7d472633363..89135bfada4 100644 --- a/unittests/test_jira_import_and_pushing_api.py +++ b/unittests/test_jira_import_and_pushing_api.py @@ -73,7 +73,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.system_settings(enable_webhooks_notifications=True) self.testuser = User.objects.get(username="admin") - self.testuser.usercontactinfo.import_execution_mode = "sync" + self.testuser.usercontactinfo.deduplication_execution_mode = "sync" self.testuser.usercontactinfo.save() token = Token.objects.get(user=self.testuser) self.client = APIClient() diff --git a/unittests/test_notifications.py b/unittests/test_notifications.py index b9153ada311..1ff9adb2d07 100644 --- a/unittests/test_notifications.py +++ b/unittests/test_notifications.py @@ -641,9 +641,9 @@ class TestAsyncNotificationTaskBody(DojoTestCase): def run(self, result=None): # Same sync pattern used by TestNotificationWebhooks: run under an impersonated user - # with import_execution_mode="sync" so downstream dojo_dispatch_task calls execute inline. + # with deduplication_execution_mode="sync" so downstream dojo_dispatch_task calls execute inline. testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.save() with impersonate(testuser): super().run(result) @@ -677,8 +677,8 @@ def test_async_task_returns_silently_on_missing_instance(self, mock_process): @patch("dojo.notifications.helper.create_notification") def test_dispatch_respects_block_execution(self, mock_create): - """With import_execution_mode="sync" on the impersonated user, the post_save signal runs the task body inline.""" - # The run() wrapper impersonates admin with import_execution_mode="sync", so + """With deduplication_execution_mode="sync" on the impersonated user, the post_save signal runs the task body inline.""" + # The run() wrapper impersonates admin with deduplication_execution_mode="sync", so # dojo_dispatch_task takes the sync branch and the task body calls create_notification # (module-level helper inside async_create_notification) synchronously. prod = Product.objects.first() @@ -705,7 +705,7 @@ def run(self, result=None): if getattr(self, "__unittest_skip__", False): return super().run(result) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_prepare_duplicates_for_delete.py b/unittests/test_prepare_duplicates_for_delete.py index 2981abe796b..e89c5fadd33 100644 --- a/unittests/test_prepare_duplicates_for_delete.py +++ b/unittests/test_prepare_duplicates_for_delete.py @@ -45,7 +45,7 @@ def setUp(self): is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, import_execution_mode="sync") + UserContactInfo.objects.create(user=self.testuser, deduplication_execution_mode="sync") self.system_settings(enable_deduplication=False) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_product_grading.py b/unittests/test_product_grading.py index 4f929233b81..7d3080dcee5 100644 --- a/unittests/test_product_grading.py +++ b/unittests/test_product_grading.py @@ -12,7 +12,7 @@ class ProductGradeTest(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_reimport_prefetch.py b/unittests/test_reimport_prefetch.py index fb6bff558c2..207ed54394d 100644 --- a/unittests/test_reimport_prefetch.py +++ b/unittests/test_reimport_prefetch.py @@ -72,7 +72,7 @@ class ReimportDuplicateFindingsTestBase(DojoTestCase): def setUp(self): super().setUp() testuser, _ = User.objects.get_or_create(username="admin") - UserContactInfo.objects.get_or_create(user=testuser, defaults={"import_execution_mode": "sync"}) + UserContactInfo.objects.get_or_create(user=testuser, defaults={"deduplication_execution_mode": "sync"}) self.system_settings(enable_deduplication=True) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_tag_inheritance.py b/unittests/test_tag_inheritance.py index ba9ac82adda..40007bee800 100644 --- a/unittests/test_tag_inheritance.py +++ b/unittests/test_tag_inheritance.py @@ -713,7 +713,7 @@ class InheritedTagsImportTestAPI(DojoAPITestCase, InheritedTagsImportMixin): def setUp(self): super().setUp() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() settings.SECURE_SSL_REDIRECT = False @@ -731,7 +731,7 @@ class InheritedTagsImportTestUI(DojoAPITestCase, InheritedTagsImportMixin): def setUp(self): super().setUp() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() settings.SECURE_SSL_REDIRECT = False diff --git a/unittests/test_tags.py b/unittests/test_tags.py index 533d9d356b1..6d35b2a87e9 100644 --- a/unittests/test_tags.py +++ b/unittests/test_tags.py @@ -387,7 +387,7 @@ def setUp(self): super().setUp() settings.SECURE_SSL_REDIRECT = False testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() TagImportMixin.setUp(self) @@ -404,7 +404,7 @@ def setUp(self): super().setUp() settings.SECURE_SSL_REDIRECT = False testuser = User.objects.get(username="admin") - testuser.usercontactinfo.import_execution_mode = "sync" + testuser.usercontactinfo.deduplication_execution_mode = "sync" testuser.usercontactinfo.save() self.login_as_admin() self.client_ui = Client() diff --git a/unittests/test_watson_async_search_index.py b/unittests/test_watson_async_search_index.py index fe54d61c0f8..916607b92a7 100644 --- a/unittests/test_watson_async_search_index.py +++ b/unittests/test_watson_async_search_index.py @@ -20,7 +20,7 @@ def setUp(self): super().setUp() self.testuser = User.objects.create(username="admin", is_staff=True, is_superuser=True) - UserContactInfo.objects.create(user=self.testuser, import_execution_mode="sync") + UserContactInfo.objects.create(user=self.testuser, deduplication_execution_mode="sync") self.system_settings(enable_product_grade=False) self.system_settings(enable_github=False) From 44af3b9376b2758f49e4fbf266bcc17610f1b079 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 15:07:52 +0200 Subject: [PATCH 05/29] refactor: rename DD_IMPORT_ASYNC_WAIT_TIMEOUT to DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT, default 60s --- dojo/importers/base_importer.py | 4 ++-- dojo/settings/settings.dist.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index 13617794ca6..bc24538fd78 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -127,7 +127,7 @@ def wait_for_post_processing(self): returned statistics reflect the deduplicated state. Only relevant in the 'async_wait' execution mode; bounded by - settings.IMPORT_ASYNC_WAIT_TIMEOUT so a stuck/missing worker degrades + settings.DEDUPLICATION_ASYNC_WAIT_TIMEOUT so a stuck/missing worker degrades to the historical (respond-anyway) behavior instead of hanging. """ if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_SYNC: @@ -143,7 +143,7 @@ def wait_for_post_processing(self): # Nothing was dispatched (e.g. empty import) — dedup is trivially done. self.deduplication_complete = True return - timeout = getattr(settings, "IMPORT_ASYNC_WAIT_TIMEOUT", 120) + timeout = getattr(settings, "DEDUPLICATION_ASYNC_WAIT_TIMEOUT", 60) logger.debug("async_wait: waiting for %d post-processing task(s) (timeout=%ss)", len(results), timeout) success = True for result in results: diff --git a/dojo/settings/settings.dist.py b/dojo/settings/settings.dist.py index 93361398f04..10f14d1627d 100644 --- a/dojo/settings/settings.dist.py +++ b/dojo/settings/settings.dist.py @@ -96,9 +96,9 @@ DD_CELERY_BROKER_PARAMS=(str, ""), DD_CELERY_BROKER_TRANSPORT_OPTIONS=(str, ""), DD_CELERY_TASK_IGNORE_RESULT=(bool, True), - # Max seconds the 'async_wait' import execution mode will wait for background - # deduplication/post-processing to finish before responding anyway. - DD_IMPORT_ASYNC_WAIT_TIMEOUT=(int, 120), + # Max seconds the 'async_wait' deduplication execution mode will wait for + # background deduplication/post-processing to finish before responding anyway. + DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT=(int, 60), DD_CELERY_RESULT_BACKEND=(str, "django-db"), DD_CELERY_RESULT_EXPIRES=(int, 86400), DD_CELERY_BEAT_SCHEDULE_FILENAME=(str, root("dojo.celery.beat.db")), @@ -867,7 +867,7 @@ def generate_url(scheme, double_slashes, user, password, host, port, path, param params=env("DD_CELERY_BROKER_PARAMS"), ) CELERY_TASK_IGNORE_RESULT = env("DD_CELERY_TASK_IGNORE_RESULT") -IMPORT_ASYNC_WAIT_TIMEOUT = env("DD_IMPORT_ASYNC_WAIT_TIMEOUT") +DEDUPLICATION_ASYNC_WAIT_TIMEOUT = env("DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT") CELERY_RESULT_BACKEND = env("DD_CELERY_RESULT_BACKEND") CELERY_TIMEZONE = TIME_ZONE CELERY_RESULT_EXPIRES = env("DD_CELERY_RESULT_EXPIRES") From 5ecf3429f775f5c036a8e70023571f7060589e32 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 15:34:36 +0200 Subject: [PATCH 06/29] refactor: keep block_execution as global switch, deduplication_execution_mode for import dedup block_execution gates every async task (notifications, jira, grading, deletes, reindex, ...) via we_want_async, so it must remain. Restore it as the global "run all async tasks in the foreground" flag and make deduplication_execution_mode an independent field that only controls how import/reimport deduplication post-processing is dispatched and awaited. - Restore the block_execution field; wants_block_execution() reads it again. - resolve_deduplication_execution_mode() falls back to block_execution -> sync. - Replace the destructive remove migration (0270) with a seed-only data migration that maps block_execution=True -> deduplication_execution_mode 'sync'; keep both. - Restore fixtures, the profile form field, and the user detail view (both fields). - Revert the test sweep: tests that need global foreground use block_execution=True again; the dedicated deduplication_execution_mode tests are updated for the split. --- dojo/celery_dispatch.py | 6 +-- ...9_usercontactinfo_import_execution_mode.py | 2 +- ..._remove_usercontactinfo_block_execution.py | 28 -------------- .../0270_seed_deduplication_execution_mode.py | 30 +++++++++++++++ ...ame_import_execution_mode_deduplication.py | 2 +- dojo/decorators.py | 6 +-- dojo/fixtures/defect_dojo_sample_data.json | 3 ++ .../defect_dojo_sample_data_locations.json | 3 ++ dojo/fixtures/dojo_testdata.json | 3 ++ dojo/fixtures/dojo_testdata_locations.json | 3 ++ dojo/middleware.py | 2 +- dojo/templates/dojo/view_user.html | 12 +++++- dojo/templates_classic/dojo/view_user.html | 12 +++++- dojo/user/models.py | 37 +++++++++++-------- dojo/user/ui/forms.py | 2 +- unittests/test_async_delete.py | 8 ++-- unittests/test_cascade_delete.py | 2 +- unittests/test_celery_dispatch_force_async.py | 2 +- unittests/test_deduplication_logic.py | 2 +- unittests/test_duplication_loops.py | 2 +- .../test_false_positive_history_logic.py | 2 +- unittests/test_finding_helper.py | 2 +- unittests/test_finding_model.py | 2 +- unittests/test_import_execution_mode.py | 29 ++++++++++++--- unittests/test_import_reimport.py | 4 +- unittests/test_importers_deduplication.py | 2 +- unittests/test_importers_performance.py | 14 +++---- unittests/test_jira_config_engagement.py | 2 +- unittests/test_jira_config_engagement_epic.py | 2 +- unittests/test_jira_import_and_pushing_api.py | 2 +- unittests/test_notifications.py | 10 ++--- .../test_prepare_duplicates_for_delete.py | 2 +- unittests/test_product_grading.py | 2 +- unittests/test_reimport_prefetch.py | 2 +- unittests/test_tag_inheritance.py | 4 +- unittests/test_tags.py | 4 +- unittests/test_watson_async_search_index.py | 2 +- 37 files changed, 157 insertions(+), 97 deletions(-) delete mode 100644 dojo/db_migrations/0270_remove_usercontactinfo_block_execution.py create mode 100644 dojo/db_migrations/0270_seed_deduplication_execution_mode.py diff --git a/dojo/celery_dispatch.py b/dojo/celery_dispatch.py index 3171291d5ee..c4eee2aef08 100644 --- a/dojo/celery_dispatch.py +++ b/dojo/celery_dispatch.py @@ -62,10 +62,10 @@ def dojo_dispatch_task(task_or_sig: _SupportsSi | _SupportsApplyAsync | Signatur - Inject `async_user_id` if missing. - Capture and inject pghistory context if available. - Respect `force_sync=True` (foreground execution) and the user's - synchronous deduplication_execution_mode. + block_execution flag. - Respect `force_async=True` (background execution even when the caller - would otherwise run synchronously, e.g. user on the synchronous mode). - `force_async` wins over `force_sync` and the user's mode. + would otherwise run synchronously, e.g. user has block_execution). + `force_async` wins over `force_sync` and block_execution. - Support `countdown=` for async dispatch. Returns: diff --git a/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py b/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py index 220f0240b1a..d8219e5596c 100644 --- a/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py +++ b/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py @@ -11,6 +11,6 @@ class Migration(migrations.Migration): migrations.AddField( model_name='usercontactinfo', name='import_execution_mode', - field=models.CharField(blank=True, choices=[('async', 'Async (do not wait)'), ('async_wait', 'Async, wait for deduplication'), ('sync', 'Synchronous (block)')], help_text="Controls how import/reimport post-processing is executed. 'Async' returns immediately (default). 'Async, wait for deduplication' runs post-processing in the background but waits for deduplication to finish before responding, so notifications and statistics are accurate. 'Synchronous' runs everything inline (and blocks all async tasks in the foreground for this user, like the old 'block execution' flag). Can be overridden per request.", max_length=20, null=True), + field=models.CharField(blank=True, choices=[('async', 'Async (do not wait)'), ('async_wait', 'Async, wait for deduplication'), ('sync', 'Synchronous (block)')], help_text="Controls how import/reimport deduplication post-processing is executed. 'Async' dispatches it to the background and returns immediately (default). 'Async, wait for deduplication' dispatches to the background but waits for deduplication to finish before responding, so notifications and statistics reflect the deduplicated state. 'Synchronous' runs the import deduplication inline. Can be overridden per request. Independent of block_execution, which forces all async tasks (notifications, jira, ...) to the foreground.", max_length=20, null=True), ), ] diff --git a/dojo/db_migrations/0270_remove_usercontactinfo_block_execution.py b/dojo/db_migrations/0270_remove_usercontactinfo_block_execution.py deleted file mode 100644 index 56bba7e2886..00000000000 --- a/dojo/db_migrations/0270_remove_usercontactinfo_block_execution.py +++ /dev/null @@ -1,28 +0,0 @@ -from django.db import migrations - - -def block_execution_to_sync_mode(apps, schema_editor): - """Map the legacy block_execution=True flag to the synchronous import execution mode.""" - UserContactInfo = apps.get_model("dojo", "UserContactInfo") - UserContactInfo.objects.filter(block_execution=True).update(import_execution_mode="sync") - - -def sync_mode_to_block_execution(apps, schema_editor): - """Reverse: restore block_execution=True for users on the synchronous mode.""" - UserContactInfo = apps.get_model("dojo", "UserContactInfo") - UserContactInfo.objects.filter(import_execution_mode="sync").update(block_execution=True) - - -class Migration(migrations.Migration): - - dependencies = [ - ('dojo', '0269_usercontactinfo_import_execution_mode'), - ] - - operations = [ - migrations.RunPython(block_execution_to_sync_mode, sync_mode_to_block_execution), - migrations.RemoveField( - model_name='usercontactinfo', - name='block_execution', - ), - ] diff --git a/dojo/db_migrations/0270_seed_deduplication_execution_mode.py b/dojo/db_migrations/0270_seed_deduplication_execution_mode.py new file mode 100644 index 00000000000..862a2b0d929 --- /dev/null +++ b/dojo/db_migrations/0270_seed_deduplication_execution_mode.py @@ -0,0 +1,30 @@ +from django.db import migrations + + +def seed_deduplication_execution_mode(apps, schema_editor): + """ + Seed the new import deduplication execution mode from the legacy block_execution flag. + + block_execution remains the global "run all async tasks in the foreground" switch; + users who had it enabled get the synchronous deduplication mode so import behavior is + unchanged for them. + """ + UserContactInfo = apps.get_model("dojo", "UserContactInfo") + UserContactInfo.objects.filter(block_execution=True).update(import_execution_mode="sync") + + +def unseed_deduplication_execution_mode(apps, schema_editor): + """Reverse: clear the seeded synchronous mode.""" + UserContactInfo = apps.get_model("dojo", "UserContactInfo") + UserContactInfo.objects.filter(import_execution_mode="sync").update(import_execution_mode=None) + + +class Migration(migrations.Migration): + + dependencies = [ + ('dojo', '0269_usercontactinfo_import_execution_mode'), + ] + + operations = [ + migrations.RunPython(seed_deduplication_execution_mode, unseed_deduplication_execution_mode), + ] diff --git a/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py b/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py index 51812be16d3..1b136c9050b 100644 --- a/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py +++ b/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py @@ -4,7 +4,7 @@ class Migration(migrations.Migration): dependencies = [ - ('dojo', '0270_remove_usercontactinfo_block_execution'), + ('dojo', '0270_seed_deduplication_execution_mode'), ] operations = [ diff --git a/dojo/decorators.py b/dojo/decorators.py index d938c6e2661..cbe33de732d 100644 --- a/dojo/decorators.py +++ b/dojo/decorators.py @@ -76,15 +76,15 @@ def we_want_async(*args, func=None, **kwargs): return True if Dojo_User.wants_block_execution(user): - logger.debug("dojo_async_task %s: running task in the foreground as deduplication_execution_mode is 'sync' for %s", func, user) + logger.debug("dojo_async_task %s: running task in the foreground as block_execution is set to True for %s", func, user) return False - logger.debug("dojo_async_task %s: running task in the background as deduplication_execution_mode is not 'sync' for %s", func, user) + logger.debug("dojo_async_task %s: running task in the background as user has not set block_execution to True for %s", func, user) return True # Defect Dojo performs all tasks asynchrnonously using celery -# *unless* the user initiating the task has set deduplication_execution_mode to 'sync' in their usercontactinfo profile +# *unless* the user initiating the task has set block_execution to True in their usercontactinfo profile def dojo_async_task(func=None, *, signature=False): def decorator(func): @wraps(func) diff --git a/dojo/fixtures/defect_dojo_sample_data.json b/dojo/fixtures/defect_dojo_sample_data.json index 313f065093d..f4a44ab5459 100644 --- a/dojo/fixtures/defect_dojo_sample_data.json +++ b/dojo/fixtures/defect_dojo_sample_data.json @@ -687,6 +687,7 @@ }, { "fields": { + "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, @@ -704,6 +705,7 @@ }, { "fields": { + "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, @@ -721,6 +723,7 @@ }, { "fields": { + "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, diff --git a/dojo/fixtures/defect_dojo_sample_data_locations.json b/dojo/fixtures/defect_dojo_sample_data_locations.json index c6d1fa39037..f338fa62344 100644 --- a/dojo/fixtures/defect_dojo_sample_data_locations.json +++ b/dojo/fixtures/defect_dojo_sample_data_locations.json @@ -687,6 +687,7 @@ }, { "fields": { + "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, @@ -706,6 +707,7 @@ }, { "fields": { + "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, @@ -725,6 +727,7 @@ }, { "fields": { + "block_execution": false, "cell_number": "", "force_password_reset": false, "github_username": null, diff --git a/dojo/fixtures/dojo_testdata.json b/dojo/fixtures/dojo_testdata.json index b1471612a4c..50707a2d2bf 100644 --- a/dojo/fixtures/dojo_testdata.json +++ b/dojo/fixtures/dojo_testdata.json @@ -277,6 +277,7 @@ "title": null, "twitter_username": "#admin", "user": 1, + "block_execution": false, "github_username": null } }, @@ -290,6 +291,7 @@ "title": null, "twitter_username": null, "user": 2, + "block_execution": false, "github_username": null } }, @@ -303,6 +305,7 @@ "title": null, "twitter_username": null, "user": 3, + "block_execution": false, "github_username": null } }, diff --git a/dojo/fixtures/dojo_testdata_locations.json b/dojo/fixtures/dojo_testdata_locations.json index 99e9e875953..3d4eb06ff9b 100644 --- a/dojo/fixtures/dojo_testdata_locations.json +++ b/dojo/fixtures/dojo_testdata_locations.json @@ -277,6 +277,7 @@ "title": null, "twitter_username": "#admin", "user": 1, + "block_execution": false, "github_username": null } }, @@ -290,6 +291,7 @@ "title": null, "twitter_username": null, "user": 2, + "block_execution": false, "github_username": null } }, @@ -303,6 +305,7 @@ "title": null, "twitter_username": null, "user": 3, + "block_execution": false, "github_username": null } }, diff --git a/dojo/middleware.py b/dojo/middleware.py index d9cdd626b1e..127ab462a09 100644 --- a/dojo/middleware.py +++ b/dojo/middleware.py @@ -281,7 +281,7 @@ def _drain_search_context_to_async(objects, source): for model_name, pk_list in model_groups.items(): batches = [pk_list[i:i + batch_size] for i in range(0, len(pk_list), batch_size)] # force_async=True keeps indexing off the request path even for users - # on the synchronous deduplication_execution_mode — index updates are slow and + # with block_execution=True — index updates are slow and # never need to be synchronous from the user's perspective. for i, batch in enumerate(batches, 1): logger.debug(f"{source}: Triggering batch {i}/{len(batches)} for {model_name}: {len(batch)} instances") diff --git a/dojo/templates/dojo/view_user.html b/dojo/templates/dojo/view_user.html index acc31927a48..75c30fd6edf 100644 --- a/dojo/templates/dojo/view_user.html +++ b/dojo/templates/dojo/view_user.html @@ -279,7 +279,17 @@

- {% trans "Import execution mode" %} + {% trans "Block execution" %} + + {% if user.usercontactinfo.block_execution %} + + {% else %} + + {% endif %} + + + + {% trans "Deduplication execution mode" %} {{ user.usercontactinfo.get_deduplication_execution_mode_display|default:_("Async (do not wait)") }} diff --git a/dojo/templates_classic/dojo/view_user.html b/dojo/templates_classic/dojo/view_user.html index 238694e1c5f..ed9e90ee8cc 100644 --- a/dojo/templates_classic/dojo/view_user.html +++ b/dojo/templates_classic/dojo/view_user.html @@ -279,7 +279,17 @@

- {% trans "Import execution mode" %} + {% trans "Block execution" %} + + {% if user.usercontactinfo.block_execution %} + + {% else %} + + {% endif %} + + + + {% trans "Deduplication execution mode" %} {{ user.usercontactinfo.get_deduplication_execution_mode_display|default:_("Async (do not wait)") }} diff --git a/dojo/user/models.py b/dojo/user/models.py index 780d5c0b4ab..f762edf86ea 100644 --- a/dojo/user/models.py +++ b/dojo/user/models.py @@ -42,24 +42,30 @@ def __str__(self): @staticmethod def wants_block_execution(user): - # this returns False if there is no user, i.e. in celery processes, unittests, etc. - # The synchronous import execution mode is the successor of the old block_execution - # flag and governs whether async tasks run in the foreground for this user. - return hasattr(user, "usercontactinfo") and user.usercontactinfo.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_SYNC + # this return False if there is no user, i.e. in celery processes, unittests, etc. + # block_execution is the global "run all async tasks in the foreground" switch and + # governs every dojo_dispatch_task/dojo_async_task call (notifications, jira, grading, + # deduplication, ...). It is distinct from deduplication_execution_mode, which only + # controls how import/reimport deduplication post-processing is dispatched/awaited. + return hasattr(user, "usercontactinfo") and user.usercontactinfo.block_execution @staticmethod def resolve_deduplication_execution_mode(user, override=None): """ - Resolve the effective import post-processing execution mode. + Resolve the effective import/reimport deduplication execution mode. - Priority: explicit request override > user profile setting > default async. + Priority: explicit request override > user profile deduplication_execution_mode > + legacy block_execution (which forces everything sync) > default async. Returns one of DEDUPLICATION_EXECUTION_MODE_ASYNC / _ASYNC_WAIT / _SYNC. """ if override in DEDUPLICATION_EXECUTION_MODES: return override info = getattr(user, "usercontactinfo", None) - if info is not None and info.deduplication_execution_mode in DEDUPLICATION_EXECUTION_MODES: - return info.deduplication_execution_mode + if info is not None: + if info.deduplication_execution_mode in DEDUPLICATION_EXECUTION_MODES: + return info.deduplication_execution_mode + if info.block_execution: + return DEDUPLICATION_EXECUTION_MODE_SYNC return DEDUPLICATION_EXECUTION_MODE_ASYNC @staticmethod @@ -101,19 +107,20 @@ class UserContactInfo(models.Model): github_username = models.CharField(blank=True, null=True, max_length=150) slack_username = models.CharField(blank=True, null=True, max_length=150, help_text=_("Email address associated with your slack account"), verbose_name=_("Slack Email Address")) slack_user_id = models.CharField(blank=True, null=True, max_length=25) + block_execution = models.BooleanField(default=False, help_text=_("Instead of async deduping a finding the findings will be deduped synchronously and will 'block' the user until completion.")) deduplication_execution_mode = models.CharField( max_length=20, choices=DEDUPLICATION_EXECUTION_MODE_CHOICES, null=True, blank=True, help_text=_( - "Controls how import/reimport post-processing is executed. " - "'Async' returns immediately (default). 'Async, wait for deduplication' " - "runs post-processing in the background but waits for deduplication to " - "finish before responding, so notifications and statistics are accurate. " - "'Synchronous' runs everything inline (and blocks all async tasks in the " - "foreground for this user, like the old 'block execution' flag). Can be " - "overridden per request.", + "Controls how import/reimport deduplication post-processing is executed. " + "'Async' dispatches it to the background and returns immediately (default). " + "'Async, wait for deduplication' dispatches to the background but waits for " + "deduplication to finish before responding, so notifications and statistics " + "reflect the deduplicated state. 'Synchronous' runs the import deduplication " + "inline. Can be overridden per request. Independent of block_execution, which " + "forces all async tasks (notifications, jira, ...) to the foreground.", ), ) force_password_reset = models.BooleanField(default=False, help_text=_("Forces this user to reset their password on next login.")) diff --git a/dojo/user/ui/forms.py b/dojo/user/ui/forms.py index 751eec14b13..4affc20cc97 100644 --- a/dojo/user/ui/forms.py +++ b/dojo/user/ui/forms.py @@ -80,7 +80,7 @@ class Meta: # Swap order: password_last_reset before token_last_reset field_order = [ "title", "phone_number", "cell_number", "twitter_username", "github_username", - "slack_username", "ui_use_tailwind", "deduplication_execution_mode", "force_password_reset", "reset_api_token", + "slack_username", "ui_use_tailwind", "block_execution", "deduplication_execution_mode", "force_password_reset", "reset_api_token", "password_last_reset", "token_last_reset", ] diff --git a/unittests/test_async_delete.py b/unittests/test_async_delete.py index b7c2b5413a4..7aa8f0769a1 100644 --- a/unittests/test_async_delete.py +++ b/unittests/test_async_delete.py @@ -27,21 +27,21 @@ class TestAsyncDelete(DojoTestCase): """ Test async_delete functionality with dojo_dispatch_task kwargs injection. - These tests use deduplication_execution_mode="sync" and crum.impersonate to run tasks synchronously, + These tests use block_execution=True and crum.impersonate to run tasks synchronously, which allows errors to surface immediately rather than being lost in background workers. """ def setUp(self): - """Set up test user with deduplication_execution_mode="sync" and disable unneeded features.""" + """Set up test user with block_execution=True and disable unneeded features.""" super().setUp() - # Create test user with deduplication_execution_mode="sync" to run tasks synchronously + # Create test user with block_execution=True to run tasks synchronously self.testuser = User.objects.create( username="test_async_delete_user", is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, deduplication_execution_mode="sync") + UserContactInfo.objects.create(user=self.testuser, block_execution=True) # Log in as the test user (for API client) self.client.force_login(self.testuser) diff --git a/unittests/test_cascade_delete.py b/unittests/test_cascade_delete.py index ab86c942b43..f24667e44f8 100644 --- a/unittests/test_cascade_delete.py +++ b/unittests/test_cascade_delete.py @@ -37,7 +37,7 @@ def setUp(self): is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, deduplication_execution_mode="sync") + UserContactInfo.objects.create(user=self.testuser, block_execution=True) self.system_settings(enable_deduplication=False) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_celery_dispatch_force_async.py b/unittests/test_celery_dispatch_force_async.py index d0479a02d7b..71170e1bb09 100644 --- a/unittests/test_celery_dispatch_force_async.py +++ b/unittests/test_celery_dispatch_force_async.py @@ -3,7 +3,7 @@ `force_async=True` is for callers (e.g. the watson async indexer middleware) that should always run their celery task in the background even when the -current user has `deduplication_execution_mode="sync"` or the caller passes `force_sync=True`. +current user has `block_execution=True` or the caller passes `force_sync=True`. """ from unittest.mock import patch diff --git a/unittests/test_deduplication_logic.py b/unittests/test_deduplication_logic.py index 1c8ed8773ab..fd6b5d2847c 100644 --- a/unittests/test_deduplication_logic.py +++ b/unittests/test_deduplication_logic.py @@ -162,7 +162,7 @@ class TestDuplicationLogic(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_duplication_loops.py b/unittests/test_duplication_loops.py index 4c4616dafc4..a3a70cf5c2b 100644 --- a/unittests/test_duplication_loops.py +++ b/unittests/test_duplication_loops.py @@ -19,7 +19,7 @@ class TestDuplicationLoops(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_false_positive_history_logic.py b/unittests/test_false_positive_history_logic.py index 9e4b7840fb4..5975348e14e 100644 --- a/unittests/test_false_positive_history_logic.py +++ b/unittests/test_false_positive_history_logic.py @@ -131,7 +131,7 @@ class TestFalsePositiveHistoryLogic(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.save() # Unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_finding_helper.py b/unittests/test_finding_helper.py index d9fb2166d4b..f6d9e747ff2 100644 --- a/unittests/test_finding_helper.py +++ b/unittests/test_finding_helper.py @@ -256,7 +256,7 @@ def setUp(self): super().setUp() self.system_settings(enable_jira=True) self.testuser = User.objects.get(username="admin") - self.testuser.usercontactinfo.deduplication_execution_mode = "sync" + self.testuser.usercontactinfo.block_execution = True self.testuser.usercontactinfo.save() token = Token.objects.get(user=self.testuser) self.client = APIClient() diff --git a/unittests/test_finding_model.py b/unittests/test_finding_model.py index 0e210f6159f..136a5e0e5f8 100644 --- a/unittests/test_finding_model.py +++ b/unittests/test_finding_model.py @@ -558,7 +558,7 @@ class TestFindingSLAExpiration(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_import_execution_mode.py b/unittests/test_import_execution_mode.py index 8c2bd4080ad..f7e8b9660c9 100644 --- a/unittests/test_import_execution_mode.py +++ b/unittests/test_import_execution_mode.py @@ -59,12 +59,31 @@ def test_invalid_override_ignored(self): def test_no_user(self): self.assertEqual(DEDUPLICATION_EXECUTION_MODE_ASYNC, Dojo_User.resolve_deduplication_execution_mode(None)) - def test_wants_block_execution_only_for_sync_mode(self): - self._set_profile(mode=DEDUPLICATION_EXECUTION_MODE_SYNC) + def test_block_execution_falls_back_to_sync(self): + # legacy global block_execution flag implies synchronous deduplication + UserContactInfo.objects.update_or_create(user=self.user, defaults={"block_execution": True}) + self.user.refresh_from_db() + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_SYNC, Dojo_User.resolve_deduplication_execution_mode(self.user)) + + def test_mode_takes_precedence_over_block_execution(self): + UserContactInfo.objects.update_or_create( + user=self.user, + defaults={"block_execution": True, "deduplication_execution_mode": DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT}, + ) + self.user.refresh_from_db() + self.assertEqual(DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT, Dojo_User.resolve_deduplication_execution_mode(self.user)) + + def test_wants_block_execution_reads_block_execution_not_mode(self): + # wants_block_execution is the global switch and is independent of the dedup mode + UserContactInfo.objects.update_or_create(user=self.user, defaults={"block_execution": True}) + self.user.refresh_from_db() self.assertTrue(Dojo_User.wants_block_execution(self.user)) - self._set_profile(mode=DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT) - self.assertFalse(Dojo_User.wants_block_execution(self.user)) - self._set_profile(mode=None) + UserContactInfo.objects.update_or_create( + user=self.user, + defaults={"block_execution": False, "deduplication_execution_mode": DEDUPLICATION_EXECUTION_MODE_SYNC}, + ) + self.user.refresh_from_db() + # a 'sync' dedup mode alone does NOT force global foreground execution self.assertFalse(Dojo_User.wants_block_execution(self.user)) diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 486d4e4eef2..417fe0ea9ea 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -2323,7 +2323,7 @@ def __init__(self, *args, **kwargs): def setUp(self): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self.login_as_admin() # self.url = reverse(self.viewname + '-list') @@ -3122,7 +3122,7 @@ def __init__(self, *args, **kwargs): def setUp(self): # still using the API to verify results testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self.login_as_admin() # self.url = reverse(self.viewname + '-list') diff --git a/unittests/test_importers_deduplication.py b/unittests/test_importers_deduplication.py index f4c692c7101..7c1359dcc36 100644 --- a/unittests/test_importers_deduplication.py +++ b/unittests/test_importers_deduplication.py @@ -37,7 +37,7 @@ def setUp(self): testuser.is_superuser = True testuser.is_staff = True testuser.save() - UserContactInfo.objects.create(user=testuser, deduplication_execution_mode="sync") + UserContactInfo.objects.create(user=testuser, block_execution=True) # Authenticate API client as admin for import endpoints self.login_as_admin() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 3fbe7cb84a4..47d30a99824 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -67,7 +67,7 @@ def setUp(self): super().setUp() testuser, _ = User.objects.get_or_create(username="admin") - UserContactInfo.objects.update_or_create(user=testuser, defaults={"deduplication_execution_mode": None}) + UserContactInfo.objects.update_or_create(user=testuser, defaults={"block_execution": False}) self.system_settings(enable_product_grade=False) self.system_settings(enable_github=False) @@ -363,7 +363,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self._import_reimport_performance( @@ -387,7 +387,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self.system_settings(enable_product_grade=True) @@ -541,7 +541,7 @@ def test_deduplication_performance_pghistory_no_async(self): self.system_settings(enable_deduplication=True) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self._deduplication_performance( @@ -653,7 +653,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self._import_reimport_performance( @@ -677,7 +677,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr configure_pghistory_triggers() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self.system_settings(enable_product_grade=True) @@ -805,7 +805,7 @@ def test_deduplication_performance_pghistory_no_async(self): self.system_settings(enable_deduplication=True) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self._deduplication_performance( diff --git a/unittests/test_jira_config_engagement.py b/unittests/test_jira_config_engagement.py index ca730fde0de..aaabfdddc5e 100644 --- a/unittests/test_jira_config_engagement.py +++ b/unittests/test_jira_config_engagement.py @@ -252,7 +252,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.user = self.get_test_admin() self.client.force_login(self.user) - self.user.usercontactinfo.deduplication_execution_mode = "sync" + self.user.usercontactinfo.block_execution = True self.user.usercontactinfo.save() # product 3 has no jira project config, double check to make sure someone didn't molest the fixture # running this in __init__ throws database access denied error diff --git a/unittests/test_jira_config_engagement_epic.py b/unittests/test_jira_config_engagement_epic.py index a77d005839a..614a10fbe57 100644 --- a/unittests/test_jira_config_engagement_epic.py +++ b/unittests/test_jira_config_engagement_epic.py @@ -39,7 +39,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.user = self.get_test_admin() self.client.force_login(self.user) - self.user.usercontactinfo.deduplication_execution_mode = "sync" + self.user.usercontactinfo.block_execution = True self.user.usercontactinfo.save() # product 3 has no jira project config, double check to make sure someone didn't molest the fixture # running this in __init__ throws database access denied error diff --git a/unittests/test_jira_import_and_pushing_api.py b/unittests/test_jira_import_and_pushing_api.py index 89135bfada4..eb3f0692dbc 100644 --- a/unittests/test_jira_import_and_pushing_api.py +++ b/unittests/test_jira_import_and_pushing_api.py @@ -73,7 +73,7 @@ def setUp(self): self.system_settings(enable_jira=True) self.system_settings(enable_webhooks_notifications=True) self.testuser = User.objects.get(username="admin") - self.testuser.usercontactinfo.deduplication_execution_mode = "sync" + self.testuser.usercontactinfo.block_execution = True self.testuser.usercontactinfo.save() token = Token.objects.get(user=self.testuser) self.client = APIClient() diff --git a/unittests/test_notifications.py b/unittests/test_notifications.py index 1ff9adb2d07..504e168e70b 100644 --- a/unittests/test_notifications.py +++ b/unittests/test_notifications.py @@ -641,9 +641,9 @@ class TestAsyncNotificationTaskBody(DojoTestCase): def run(self, result=None): # Same sync pattern used by TestNotificationWebhooks: run under an impersonated user - # with deduplication_execution_mode="sync" so downstream dojo_dispatch_task calls execute inline. + # with block_execution=True so downstream dojo_dispatch_task calls execute inline. testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.save() with impersonate(testuser): super().run(result) @@ -677,8 +677,8 @@ def test_async_task_returns_silently_on_missing_instance(self, mock_process): @patch("dojo.notifications.helper.create_notification") def test_dispatch_respects_block_execution(self, mock_create): - """With deduplication_execution_mode="sync" on the impersonated user, the post_save signal runs the task body inline.""" - # The run() wrapper impersonates admin with deduplication_execution_mode="sync", so + """With block_execution=True on the impersonated user, the post_save signal runs the task body inline.""" + # The run() wrapper impersonates admin with block_execution=True, so # dojo_dispatch_task takes the sync branch and the task body calls create_notification # (module-level helper inside async_create_notification) synchronously. prod = Product.objects.first() @@ -705,7 +705,7 @@ def run(self, result=None): if getattr(self, "__unittest_skip__", False): return super().run(result) testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_prepare_duplicates_for_delete.py b/unittests/test_prepare_duplicates_for_delete.py index e89c5fadd33..89ed84069db 100644 --- a/unittests/test_prepare_duplicates_for_delete.py +++ b/unittests/test_prepare_duplicates_for_delete.py @@ -45,7 +45,7 @@ def setUp(self): is_staff=True, is_superuser=True, ) - UserContactInfo.objects.create(user=self.testuser, deduplication_execution_mode="sync") + UserContactInfo.objects.create(user=self.testuser, block_execution=True) self.system_settings(enable_deduplication=False) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_product_grading.py b/unittests/test_product_grading.py index 7d3080dcee5..5f4680c3315 100644 --- a/unittests/test_product_grading.py +++ b/unittests/test_product_grading.py @@ -12,7 +12,7 @@ class ProductGradeTest(DojoTestCase): def run(self, result=None): testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.save() # unit tests are running without any user, which will result in actions like dedupe happening in the celery process diff --git a/unittests/test_reimport_prefetch.py b/unittests/test_reimport_prefetch.py index 207ed54394d..cc5b2f40e1e 100644 --- a/unittests/test_reimport_prefetch.py +++ b/unittests/test_reimport_prefetch.py @@ -72,7 +72,7 @@ class ReimportDuplicateFindingsTestBase(DojoTestCase): def setUp(self): super().setUp() testuser, _ = User.objects.get_or_create(username="admin") - UserContactInfo.objects.get_or_create(user=testuser, defaults={"deduplication_execution_mode": "sync"}) + UserContactInfo.objects.get_or_create(user=testuser, defaults={"block_execution": True}) self.system_settings(enable_deduplication=True) self.system_settings(enable_product_grade=False) diff --git a/unittests/test_tag_inheritance.py b/unittests/test_tag_inheritance.py index 40007bee800..f1f9dce8505 100644 --- a/unittests/test_tag_inheritance.py +++ b/unittests/test_tag_inheritance.py @@ -713,7 +713,7 @@ class InheritedTagsImportTestAPI(DojoAPITestCase, InheritedTagsImportMixin): def setUp(self): super().setUp() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self.login_as_admin() settings.SECURE_SSL_REDIRECT = False @@ -731,7 +731,7 @@ class InheritedTagsImportTestUI(DojoAPITestCase, InheritedTagsImportMixin): def setUp(self): super().setUp() testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self.login_as_admin() settings.SECURE_SSL_REDIRECT = False diff --git a/unittests/test_tags.py b/unittests/test_tags.py index 6d35b2a87e9..ea460f247bf 100644 --- a/unittests/test_tags.py +++ b/unittests/test_tags.py @@ -387,7 +387,7 @@ def setUp(self): super().setUp() settings.SECURE_SSL_REDIRECT = False testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self.login_as_admin() TagImportMixin.setUp(self) @@ -404,7 +404,7 @@ def setUp(self): super().setUp() settings.SECURE_SSL_REDIRECT = False testuser = User.objects.get(username="admin") - testuser.usercontactinfo.deduplication_execution_mode = "sync" + testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self.login_as_admin() self.client_ui = Client() diff --git a/unittests/test_watson_async_search_index.py b/unittests/test_watson_async_search_index.py index 916607b92a7..8edba606ac7 100644 --- a/unittests/test_watson_async_search_index.py +++ b/unittests/test_watson_async_search_index.py @@ -20,7 +20,7 @@ def setUp(self): super().setUp() self.testuser = User.objects.create(username="admin", is_staff=True, is_superuser=True) - UserContactInfo.objects.create(user=self.testuser, deduplication_execution_mode="sync") + UserContactInfo.objects.create(user=self.testuser, block_execution=True) self.system_settings(enable_product_grade=False) self.system_settings(enable_github=False) From cd60613f2d244eb91a5b8fce483ffd61705bd350 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 15:41:41 +0200 Subject: [PATCH 07/29] docs: 2.60 upgrade notes for deduplication_execution_mode --- docs/content/en/open_source/upgrading/2.60.md | 43 ++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/docs/content/en/open_source/upgrading/2.60.md b/docs/content/en/open_source/upgrading/2.60.md index e7811aa0b99..529dd8ce54c 100644 --- a/docs/content/en/open_source/upgrading/2.60.md +++ b/docs/content/en/open_source/upgrading/2.60.md @@ -2,6 +2,45 @@ title: 'Upgrading to DefectDojo Version 2.60.x' toc_hide: true weight: -20260601 -description: No special instructions. +description: New deduplication execution mode for import/reimport. --- -There are no special instructions for upgrading to 2.60.x. Check the [Release Notes](https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.60.0) for the contents of the release. + +## Deduplication execution mode for import/reimport + +This release adds a new `deduplication_execution_mode` setting that controls how +import/reimport deduplication post-processing is dispatched and whether the API +response waits for it. It can be set per user (profile) and overridden per request +on the import and reimport endpoints. + +Modes: + +- `async` (default): deduplication and the rest of post-processing are dispatched + to the background and the response returns immediately. This is the historical + behavior; nothing changes for existing users. +- `async_wait`: post-processing is still dispatched to the background, but the + request waits for deduplication to finish before responding. As a result the + `scan_added` notification and the statistics in the import/reimport response + reflect the deduplicated state (findings that turned out to be duplicates are + no longer counted/listed as new). JIRA push, product grading and other + non-deduplication tasks remain asynchronous and are not awaited. +- `sync`: import deduplication runs inline in the web request. + +The wait in `async_wait` is bounded by the new `DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT` +environment variable (default `60` seconds). If no worker picks up the work within +the timeout, the request responds anyway (degrading to the `async` outcome) rather +than hanging. + +The import/reimport response now also includes a `deduplication_complete` boolean +indicating whether deduplication had finished by the time the response was produced. + +### Relationship to `block_execution` + +The existing `block_execution` profile flag is unchanged. It remains the global +switch that forces **all** of a user's asynchronous tasks (notifications, JIRA +push, product grading, deduplication, ...) to run in the foreground. +`deduplication_execution_mode` is independent and narrower — it only affects +import/reimport deduplication post-processing. A user who has `block_execution` +enabled continues to get fully synchronous imports; the upgrade migration seeds +their `deduplication_execution_mode` to `sync` so behavior is unchanged. + +No action is required to upgrade. Check the [Release Notes](https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.60.0) for the contents of the release. From 3eb9d248b902d52fd0533d5e1fd1848b16c2d822 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 16:00:52 +0200 Subject: [PATCH 08/29] fix(importers): honor profile deduplication_execution_mode for UI import/reimport The API resolves deduplication_execution_mode in the serializer, but the UI import/reimport views built their context straight from the form and never resolved it, so UI imports silently defaulted to async regardless of the user's profile. Resolve it from the profile in both UI process_form paths. --- dojo/engagement/ui/views.py | 4 ++++ dojo/test/ui/views.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/dojo/engagement/ui/views.py b/dojo/engagement/ui/views.py index 475ff9cf5f9..aeca7c2ad99 100644 --- a/dojo/engagement/ui/views.py +++ b/dojo/engagement/ui/views.py @@ -948,6 +948,10 @@ def process_form( "create_finding_groups_for_all_findings": form.cleaned_data.get("create_finding_groups_for_all_findings", None), "environment": self.get_development_environment(environment_name=form.cleaned_data.get("environment")), }) + # Honor the user's profile deduplication_execution_mode for UI imports. The API resolves + # this in the serializer; the UI has no per-import selector, so fall back to the profile + # (or block_execution) instead of silently defaulting to async. + context["deduplication_execution_mode"] = Dojo_User.resolve_deduplication_execution_mode(request.user) # Create the engagement if necessary self.create_engagement(context) # close_old_findings_product_scope is a modifier of close_old_findings. diff --git a/dojo/test/ui/views.py b/dojo/test/ui/views.py index b89c53cd4e3..b549a1c2236 100644 --- a/dojo/test/ui/views.py +++ b/dojo/test/ui/views.py @@ -44,6 +44,7 @@ from dojo.location.models import Location from dojo.models import ( BurpRawRequestResponse, + Dojo_User, Endpoint, Finding, Finding_Group, @@ -950,6 +951,10 @@ def process_form( "close_old_findings": form.cleaned_data.get("close_old_findings", None), "create_finding_groups_for_all_findings": form.cleaned_data.get("create_finding_groups_for_all_findings", None), }) + # Honor the user's profile deduplication_execution_mode for UI reimports. The API resolves + # this in the serializer; the UI has no per-import selector, so fall back to the profile + # (or block_execution) instead of silently defaulting to async. + context["deduplication_execution_mode"] = Dojo_User.resolve_deduplication_execution_mode(request.user) # Override the form values of active and verified if activeChoice := form.cleaned_data.get("active", None): if activeChoice == "force_to_true": From c168da8c196ba97be21beca5f61f2177601e7771 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 16:16:59 +0200 Subject: [PATCH 09/29] test: revert set_block_execution to the block_execution checkbox Option B keeps block_execution as a checkbox in the profile form, so the integration helper toggles id_block_execution again instead of the deduplication_execution_mode select (which no longer existed under that id). --- tests/base_test_class.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/base_test_class.py b/tests/base_test_class.py index 2798d2f2a58..d9043a43fd5 100644 --- a/tests/base_test_class.py +++ b/tests/base_test_class.py @@ -10,7 +10,7 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions -from selenium.webdriver.support.ui import Select, WebDriverWait +from selenium.webdriver.support.ui import WebDriverWait # import time logging.basicConfig( @@ -329,24 +329,22 @@ def enable_github(self): return self.enable_system_setting("id_enable_github") def set_block_execution(self, *, block_execution=True): - # we set the admin user (ourselves) to the synchronous import execution mode - # (the successor of the old block_execution flag) when block_execution=True. - # This forces dedupe to happen synchronously, among other things like - # notifications, rules, ... Otherwise we select the default async mode. - target_mode = "sync" if block_execution else "async" - logger.info("setting import execution mode to: %s", target_mode) + # we set the admin user (ourselves) to have block_execution checked + # this will force dedupe to happen synchronously, among other things like notifications, rules, ... + logger.info("setting block execution to: %s", block_execution) driver = self.driver driver.get(self.base_url + "profile") - select = Select(driver.find_element(By.ID, "id_import_execution_mode")) - if select.first_selected_option.get_attribute("value") != target_mode: - select.select_by_value(target_mode) + if ( + driver.find_element(By.ID, "id_block_execution").is_selected() + != block_execution + ): + driver.find_element(By.XPATH, '//*[@id="id_block_execution"]').click() # save settings driver.find_element(By.CSS_SELECTOR, "input.btn.btn-primary").click() - # check if it's applied after reload - select = Select(driver.find_element(By.ID, "id_import_execution_mode")) + # check if it's enabled after reload self.assertEqual( - select.first_selected_option.get_attribute("value"), - target_mode, + driver.find_element(By.ID, "id_block_execution").is_selected(), + block_execution, ) return driver From 6a16c83ee137e27259ee11e8cdba36bdff808312 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 16:40:08 +0200 Subject: [PATCH 10/29] refactor(migrations): split into schema add + data seed (2 migrations) Collapse the add/rename pair into a single AddField that creates deduplication_execution_mode with its final name, and keep the seed as a separate data migration, per the convention of keeping schema and data migrations apart. Drops the interim rename migration. --- ...rcontactinfo_deduplication_execution_mode.py} | 2 +- .../0270_seed_deduplication_execution_mode.py | 6 +++--- ...rename_import_execution_mode_deduplication.py | 16 ---------------- 3 files changed, 4 insertions(+), 20 deletions(-) rename dojo/db_migrations/{0269_usercontactinfo_import_execution_mode.py => 0269_usercontactinfo_deduplication_execution_mode.py} (95%) delete mode 100644 dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py diff --git a/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py b/dojo/db_migrations/0269_usercontactinfo_deduplication_execution_mode.py similarity index 95% rename from dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py rename to dojo/db_migrations/0269_usercontactinfo_deduplication_execution_mode.py index d8219e5596c..94f1e29f255 100644 --- a/dojo/db_migrations/0269_usercontactinfo_import_execution_mode.py +++ b/dojo/db_migrations/0269_usercontactinfo_deduplication_execution_mode.py @@ -10,7 +10,7 @@ class Migration(migrations.Migration): operations = [ migrations.AddField( model_name='usercontactinfo', - name='import_execution_mode', + name='deduplication_execution_mode', field=models.CharField(blank=True, choices=[('async', 'Async (do not wait)'), ('async_wait', 'Async, wait for deduplication'), ('sync', 'Synchronous (block)')], help_text="Controls how import/reimport deduplication post-processing is executed. 'Async' dispatches it to the background and returns immediately (default). 'Async, wait for deduplication' dispatches to the background but waits for deduplication to finish before responding, so notifications and statistics reflect the deduplicated state. 'Synchronous' runs the import deduplication inline. Can be overridden per request. Independent of block_execution, which forces all async tasks (notifications, jira, ...) to the foreground.", max_length=20, null=True), ), ] diff --git a/dojo/db_migrations/0270_seed_deduplication_execution_mode.py b/dojo/db_migrations/0270_seed_deduplication_execution_mode.py index 862a2b0d929..7e56f1674f4 100644 --- a/dojo/db_migrations/0270_seed_deduplication_execution_mode.py +++ b/dojo/db_migrations/0270_seed_deduplication_execution_mode.py @@ -10,19 +10,19 @@ def seed_deduplication_execution_mode(apps, schema_editor): unchanged for them. """ UserContactInfo = apps.get_model("dojo", "UserContactInfo") - UserContactInfo.objects.filter(block_execution=True).update(import_execution_mode="sync") + UserContactInfo.objects.filter(block_execution=True).update(deduplication_execution_mode="sync") def unseed_deduplication_execution_mode(apps, schema_editor): """Reverse: clear the seeded synchronous mode.""" UserContactInfo = apps.get_model("dojo", "UserContactInfo") - UserContactInfo.objects.filter(import_execution_mode="sync").update(import_execution_mode=None) + UserContactInfo.objects.filter(deduplication_execution_mode="sync").update(deduplication_execution_mode=None) class Migration(migrations.Migration): dependencies = [ - ('dojo', '0269_usercontactinfo_import_execution_mode'), + ('dojo', '0269_usercontactinfo_deduplication_execution_mode'), ] operations = [ diff --git a/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py b/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py deleted file mode 100644 index 1b136c9050b..00000000000 --- a/dojo/db_migrations/0271_rename_import_execution_mode_deduplication.py +++ /dev/null @@ -1,16 +0,0 @@ -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('dojo', '0270_seed_deduplication_execution_mode'), - ] - - operations = [ - migrations.RenameField( - model_name='usercontactinfo', - old_name='import_execution_mode', - new_name='deduplication_execution_mode', - ), - ] From 7c07cd4896ac075e4487e1f9c8d71cd976965e7a Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 16:46:44 +0200 Subject: [PATCH 11/29] test: use versioned_fixtures so dedup mode tests pass under V3_FEATURE_LOCATIONS The CI 'true' matrix variant enables V3_FEATURE_LOCATIONS, where loading dojo_testdata.json (containing Endpoints) raises NotImplementedError. Decorate the test classes with @versioned_fixtures so the locations fixture is used in that mode, matching the other import/reimport test suites. --- unittests/test_import_execution_mode.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/unittests/test_import_execution_mode.py b/unittests/test_import_execution_mode.py index f7e8b9660c9..e575b987c01 100644 --- a/unittests/test_import_execution_mode.py +++ b/unittests/test_import_execution_mode.py @@ -15,9 +15,10 @@ UserContactInfo, ) -from .dojo_test_case import DojoAPITestCase, DojoTestCase, get_unit_tests_path +from .dojo_test_case import DojoAPITestCase, DojoTestCase, get_unit_tests_path, versioned_fixtures +@versioned_fixtures class ImportExecutionModeResolverTest(DojoTestCase): """resolve_deduplication_execution_mode: request override > profile > default.""" @@ -87,6 +88,7 @@ def test_wants_block_execution_reads_block_execution_not_mode(self): self.assertFalse(Dojo_User.wants_block_execution(self.user)) +@versioned_fixtures class ImporterDispatchKwargsTest(DojoTestCase): """deduplication_execution_mode -> dojo_dispatch_task force flags.""" @@ -121,6 +123,7 @@ def test_external_force_sync_promotes_to_sync_mode(self): self.assertEqual(DEDUPLICATION_EXECUTION_MODE_SYNC, importer.deduplication_execution_mode) +@versioned_fixtures @override_settings(CELERY_TASK_ALWAYS_EAGER=True) class ImportExecutionModeAPITest(DojoAPITestCase): @@ -170,6 +173,7 @@ def test_import_rejects_invalid_mode(self): self.import_scan(payload, 400) +@versioned_fixtures class NotificationDeduplicationRefreshTest(DojoTestCase): """notify_scan_added refreshes duplicate status from the DB once dedup is complete.""" From 63466719ab96347f577e483f7d1e1f6e6e566786 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Jun 2026 22:04:14 +0200 Subject: [PATCH 12/29] test(perf): add async_wait deduplication performance test Covers the 'async_wait' execution mode: post-processing is dispatched to a background worker and the request joins on it before responding. The dedup queries run in the worker (separate connection), not the web request, so the only web-side delta over the plain async path is the post-dedup notification refresh SELECT (+1): 109->110 first import, 89->90 second. Does not use CELERY_TASK_ALWAYS_EAGER (that would run the task inline on the request connection and wrongly count worker queries). The dedup batch is dispatched async and the join (AsyncResult.get) is mocked to return instantly, simulating a finished worker. Adds an optional dedup_mode param to _deduplication_performance. --- unittests/test_importers_performance.py | 38 ++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 47d30a99824..8812f7287d4 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -403,7 +403,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr ) # Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results. - def _deduplication_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, *, check_duplicates=True): + def _deduplication_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, *, check_duplicates=True, dedup_mode=None): """ Test method to measure deduplication performance by importing the same scan twice. The second import should result in all findings being marked as duplicates. @@ -444,6 +444,7 @@ def _deduplication_performance(self, expected_num_queries1, expected_num_async_t "verified": True, "scan_type": STACK_HAWK_SCAN_TYPE, "engagement": engagement, + **({"deduplication_execution_mode": dedup_mode} if dedup_mode else {}), } importer = DefaultImporter(**import_options) _, _, len_new_findings1, len_closed_findings1, _, _, _ = importer.process_scan(scan) @@ -471,6 +472,7 @@ def _deduplication_performance(self, expected_num_queries1, expected_num_async_t "verified": True, "scan_type": STACK_HAWK_SCAN_TYPE, "engagement": engagement, + **({"deduplication_execution_mode": dedup_mode} if dedup_mode else {}), } importer = DefaultImporter(**import_options) _, _, len_new_findings2, len_closed_findings2, _, _, _ = importer.process_scan(scan) @@ -551,6 +553,40 @@ def test_deduplication_performance_pghistory_no_async(self): expected_num_async_tasks2=2, ) + @override_settings(ENABLE_AUDITLOG=True) + def test_deduplication_performance_pghistory_async_wait(self): + """ + Deduplication performance in the 'async_wait' execution mode: post-processing is + dispatched to a background worker, then the request joins on the result before + responding. The dedup queries run in the worker (a separate connection), NOT in + the web request, so the only web-side cost over the plain async path is the single + post-dedup notification refresh SELECT (+1). + + We do not use CELERY_TASK_ALWAYS_EAGER here — that would run the dispatched task + inline on the request's connection and wrongly count the worker's dedup queries. + Instead the dedup batch is dispatched async (not executed in-process) and the join + (AsyncResult.get) is mocked to return immediately, simulating a worker that has + finished. deduplication_complete is therefore True (so the refresh runs), but the + findings are not actually deduplicated in-test, so check_duplicates is False. + """ + configure_audit_system() + configure_pghistory_triggers() + + # Enable deduplication + self.system_settings(enable_deduplication=True) + + # Simulate the background worker's post-processing having completed so the join + # returns instantly without executing dedup on the request's DB connection. + with patch("celery.result.AsyncResult.get", return_value=None): + self._deduplication_performance( + expected_num_queries1=93, + expected_num_async_tasks1=2, + expected_num_queries2=73, + expected_num_async_tasks2=2, + dedup_mode="async_wait", + check_duplicates=False, + ) + @tag("performance") @override_settings(V3_FEATURE_LOCATIONS=True) From cac3498cd3741b803c9edd9317509a38f0d455f2 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Thu, 25 Jun 2026 22:12:48 +0200 Subject: [PATCH 13/29] fix(migrations): renumber import-execution-mode migrations onto dev (0270/0271) dev added 0269_normalize_blank_finding_components, colliding with this branch's 0269. Renumber to 0270_usercontactinfo_deduplication_execution_mode and 0271_seed_deduplication_execution_mode and repoint the dependency chain onto dev's 0269. --- ....py => 0270_usercontactinfo_deduplication_execution_mode.py} | 2 +- ...cution_mode.py => 0271_seed_deduplication_execution_mode.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename dojo/db_migrations/{0269_usercontactinfo_deduplication_execution_mode.py => 0270_usercontactinfo_deduplication_execution_mode.py} (94%) rename dojo/db_migrations/{0270_seed_deduplication_execution_mode.py => 0271_seed_deduplication_execution_mode.py} (94%) diff --git a/dojo/db_migrations/0269_usercontactinfo_deduplication_execution_mode.py b/dojo/db_migrations/0270_usercontactinfo_deduplication_execution_mode.py similarity index 94% rename from dojo/db_migrations/0269_usercontactinfo_deduplication_execution_mode.py rename to dojo/db_migrations/0270_usercontactinfo_deduplication_execution_mode.py index 94f1e29f255..412bfe2d546 100644 --- a/dojo/db_migrations/0269_usercontactinfo_deduplication_execution_mode.py +++ b/dojo/db_migrations/0270_usercontactinfo_deduplication_execution_mode.py @@ -4,7 +4,7 @@ class Migration(migrations.Migration): dependencies = [ - ('dojo', '0268_release_authorization_to_pro'), + ('dojo', '0269_normalize_blank_finding_components'), ] operations = [ diff --git a/dojo/db_migrations/0270_seed_deduplication_execution_mode.py b/dojo/db_migrations/0271_seed_deduplication_execution_mode.py similarity index 94% rename from dojo/db_migrations/0270_seed_deduplication_execution_mode.py rename to dojo/db_migrations/0271_seed_deduplication_execution_mode.py index 7e56f1674f4..74154304acf 100644 --- a/dojo/db_migrations/0270_seed_deduplication_execution_mode.py +++ b/dojo/db_migrations/0271_seed_deduplication_execution_mode.py @@ -22,7 +22,7 @@ def unseed_deduplication_execution_mode(apps, schema_editor): class Migration(migrations.Migration): dependencies = [ - ('dojo', '0269_usercontactinfo_deduplication_execution_mode'), + ('dojo', '0270_usercontactinfo_deduplication_execution_mode'), ] operations = [ From e1f931c8a4e824ef30f51cc94f5bb5e47414cc43 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Thu, 25 Jun 2026 22:16:21 +0200 Subject: [PATCH 14/29] fix(importers): drop ignore_result=False override on post_process_findings_batch The async_wait deduplication join works with the global CELERY_TASK_IGNORE_RESULT=True default (verified live by reviewer against a real broker/worker). Removing the per-task override avoids writing a celery_taskmeta result row for every batch of every import (incl. plain async), which were retained for CELERY_RESULT_EXPIRES. --- dojo/finding/helper.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py index a5ae551f367..d83d032b176 100644 --- a/dojo/finding/helper.py +++ b/dojo/finding/helper.py @@ -456,13 +456,7 @@ def post_process_finding_save_internal(finding, dedupe_option=True, rules_option jira_services.push(finding.finding_group) -# ignore_result=False so the 'async_wait' import execution mode can join on the -# dispatched batch via AsyncResult.get() even when CELERY_TASK_IGNORE_RESULT=True. -# NOTE: this override may not be strictly necessary — in the past .get()/await -# appears to have worked with the global CELERY_TASK_IGNORE_RESULT=True and a -# Redis broker. Needs verification against a real broker/worker setup; if join -# works without it, this override can be removed to avoid storing extra results. -@app.task(ignore_result=False) +@app.task def post_process_findings_batch( finding_ids, *args, From 6f39a0834b53ce358d62fa50a98cffbf98fd1988 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Thu, 25 Jun 2026 22:22:45 +0200 Subject: [PATCH 15/29] test(e2e): selenium integration test for async_wait deduplication mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds async_wait coverage to the existing tests/dedupe_test.py DedupeTest suite: set the user profile to 'async_wait', import a Checkmarx report into two tests of a dedupe-on-engagement engagement, and assert the duplicates are visible immediately (check_nb_duplicates_no_wait, no sleep/retry) — proving the request blocked until the cross-process deduplication finished. This exercises the real worker->request join that the eager unit tests cannot cover. Adds a set_deduplication_execution_mode() helper to base_test_class. --- tests/base_test_class.py | 20 ++++++++- tests/dedupe_test.py | 94 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/tests/base_test_class.py b/tests/base_test_class.py index d9043a43fd5..d94b5020aed 100644 --- a/tests/base_test_class.py +++ b/tests/base_test_class.py @@ -10,7 +10,7 @@ from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions -from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.ui import Select, WebDriverWait # import time logging.basicConfig( @@ -354,6 +354,24 @@ def enable_block_execution(self): def disable_block_execution(self): self.set_block_execution(block_execution=False) + def set_deduplication_execution_mode(self, mode="async"): + # Set the admin user's (ourselves) deduplication_execution_mode profile + # field. "async_wait" makes import/reimport block until background + # deduplication has finished before returning. + logger.info("setting deduplication execution mode to: %s", mode) + driver = self.driver + driver.get(self.base_url + "profile") + select = Select(driver.find_element(By.ID, "id_deduplication_execution_mode")) + if select.first_selected_option.get_attribute("value") != mode: + select.select_by_value(mode) + # save settings + driver.find_element(By.CSS_SELECTOR, "input.btn.btn-primary").click() + # check it persisted after reload + driver.get(self.base_url + "profile") + select = Select(driver.find_element(By.ID, "id_deduplication_execution_mode")) + self.assertEqual(select.first_selected_option.get_attribute("value"), mode) + return driver + def enable_deduplication(self): return self.enable_system_setting("id_enable_deduplication") diff --git a/tests/dedupe_test.py b/tests/dedupe_test.py index d0ffee6fad2..2b6d50f119d 100644 --- a/tests/dedupe_test.py +++ b/tests/dedupe_test.py @@ -509,6 +509,93 @@ def test_check_service(self): # Since we imported the same report twice but with different service names, we should have no duplicates self.check_nb_duplicates(0) +# -------------------------------------------------------------------------------------------------------- +# 'async_wait' deduplication execution mode +# Unlike the tests above (which poll with retries while the background dedupe +# task catches up), 'async_wait' makes the import request block until dedupe +# has finished, so duplicates must be visible immediately. This exercises the +# real cross-process worker->request join that the eager unit tests cannot. +# -------------------------------------------------------------------------------------------------------- + def check_nb_duplicates_no_wait(self, expected_number_of_duplicates): + # Deliberately NO sleep/retry (contrast with check_nb_duplicates): in + # 'async_wait' mode the import only returns once dedupe has completed, so + # the duplicates must already be marked on the first findings-list load. + logger.debug("checking duplicates without waiting...") + driver = self.driver + self.goto_all_findings_list(driver) + dupe_count = 0 + trs = driver.find_elements(By.XPATH, '//*[@id="open_findings"]/tbody/tr') + for row in trs: + concatRow = " ".join([td.text for td in row.find_elements(By.XPATH, ".//td")]) + if "(DUPE)" and "Duplicate" in concatRow: + dupe_count += 1 + if dupe_count != expected_number_of_duplicates: + logger.debug(driver.find_element(By.ID, "open_findings").get_attribute("innerHTML")) + self.assertEqual( + dupe_count, expected_number_of_duplicates, + "duplicates were not visible immediately after an async_wait import " + "(the request did not block until deduplication finished)", + ) + + @on_exception_html_source_logger + def test_set_async_wait_mode(self): + self.set_deduplication_execution_mode("async_wait") + + @on_exception_html_source_logger + def test_reset_dedup_mode(self): + # Restore the default so later tests/suites are unaffected. + self.set_deduplication_execution_mode("async") + + @on_exception_html_source_logger + def test_add_async_wait_test_suite(self): + logger.debug("async_wait dedupe - creating engagement + two Checkmarx tests...") + driver = self.driver + self.goto_product_overview(driver) + driver.find_element(By.CSS_SELECTOR, ".dropdown-toggle.pull-left").click() + driver.find_element(By.LINK_TEXT, "Add New Engagement").click() + driver.find_element(By.ID, "id_name").send_keys("Dedupe Async Wait") + driver.find_element(By.XPATH, '//*[@id="id_deduplication_on_engagement"]').click() + driver.find_element(By.NAME, "_Add Tests").click() + self.assertTrue(self.is_success_message_present(text="Engagement added successfully.")) + # Test 1 + driver.find_element(By.ID, "id_title").send_keys("Async Wait Test 1") + Select(driver.find_element(By.ID, "id_test_type")).select_by_visible_text("Checkmarx Scan") + Select(driver.find_element(By.ID, "id_environment")).select_by_visible_text("Development") + driver.find_element(By.NAME, "_Add Another Test").click() + self.assertTrue(self.is_success_message_present(text="Test added successfully")) + # Test 2 + driver.find_element(By.ID, "id_title").send_keys("Async Wait Test 2") + Select(driver.find_element(By.ID, "id_test_type")).select_by_visible_text("Checkmarx Scan") + Select(driver.find_element(By.ID, "id_environment")).select_by_visible_text("Development") + driver.find_element(By.CSS_SELECTOR, "input.btn.btn-primary").click() + self.assertTrue(self.is_success_message_present(text="Test added successfully")) + + @on_exception_html_source_logger + def test_import_async_wait_tests(self): + logger.debug("importing the same Checkmarx report into both tests under async_wait...") + driver = self.driver + # Test 1 - same report imported into both tests; the second import's + # findings deduplicate against the first. + self.goto_active_engagements_overview(driver) + driver.find_element(By.PARTIAL_LINK_TEXT, "Dedupe Async Wait").click() + driver.find_element(By.PARTIAL_LINK_TEXT, "Async Wait Test 1").click() + driver.find_element(By.ID, "dropdownMenu1").click() + driver.find_element(By.LINK_TEXT, "Re-Upload Scan").click() + driver.find_element(By.ID, "id_file").send_keys(str(self.relative_path / "dedupe_scans" / "multiple_findings.xml")) + driver.find_elements(By.CSS_SELECTOR, "button.btn.btn-primary")[1].click() + self.assertTrue(self.is_success_message_present(text="a total of 2 findings")) + # Test 2 - second import of the same findings -> 2 duplicates. + self.goto_active_engagements_overview(driver) + driver.find_element(By.PARTIAL_LINK_TEXT, "Dedupe Async Wait").click() + driver.find_element(By.PARTIAL_LINK_TEXT, "Async Wait Test 2").click() + driver.find_element(By.ID, "dropdownMenu1").click() + driver.find_element(By.LINK_TEXT, "Re-Upload Scan").click() + driver.find_element(By.ID, "id_file").send_keys(str(self.relative_path / "dedupe_scans" / "multiple_findings.xml")) + driver.find_elements(By.CSS_SELECTOR, "button.btn.btn-primary")[1].click() + self.assertTrue(self.is_success_message_present(text="a total of 2 findings")) + # No sleep/retry: async_wait must have blocked until dedupe finished. + self.check_nb_duplicates_no_wait(2) + def add_dedupe_tests_to_suite(suite, *, jira=False, github=False, block_execution=False): suite.addTest(BaseTestCase("test_login")) @@ -561,6 +648,13 @@ def add_dedupe_tests_to_suite(suite, *, jira=False, github=False, block_executio suite.addTest(DedupeTest("test_delete_findings")) suite.addTest(DedupeTest("test_import_service")) suite.addTest(DedupeTest("test_check_service")) + # 'async_wait' execution mode: the import request blocks until dedupe + # completes, so duplicates are visible immediately (no polling needed). + suite.addTest(DedupeTest("test_set_async_wait_mode")) + suite.addTest(DedupeTest("test_delete_findings")) + suite.addTest(DedupeTest("test_add_async_wait_test_suite")) + suite.addTest(DedupeTest("test_import_async_wait_tests")) + suite.addTest(DedupeTest("test_reset_dedup_mode")) # Clean up suite.addTest(ProductTest("test_delete_product")) return suite From 202ccbd08f96170c5030892ee0d3bb96b63cad4a Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 26 Jun 2026 22:44:24 +0200 Subject: [PATCH 16/29] fix(importers): make async_wait actually join via per-dispatch ignore_result The async_wait join via AsyncResult.get() was a silent no-op: the global CELERY_TASK_IGNORE_RESULT=True means post_process_findings_batch never stores a result, so .get() returns immediately instead of blocking. The import responded with deduplication_complete=true while dedup had not run. Pass ignore_result=False only on the async_wait dispatch, threaded through dojo_dispatch_task to apply_async. The task decorator stays plain @app.task, so async/sync imports do not write useless celery_taskmeta result rows. --- dojo/celery_dispatch.py | 10 +++++++++- dojo/importers/default_importer.py | 3 +++ dojo/importers/default_reimporter.py | 3 +++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/dojo/celery_dispatch.py b/dojo/celery_dispatch.py index c4eee2aef08..445cf99fc1b 100644 --- a/dojo/celery_dispatch.py +++ b/dojo/celery_dispatch.py @@ -76,6 +76,11 @@ def dojo_dispatch_task(task_or_sig: _SupportsSi | _SupportsApplyAsync | Signatur from dojo.decorators import dojo_async_task_counter, we_want_async # noqa: PLC0415 circular import countdown = cast("int", kwargs.pop("countdown", 0)) + # Per-dispatch result storage. The task default is `ignore_result` (global + # CELERY_TASK_IGNORE_RESULT=True), so AsyncResult.get() is a no-op. Callers + # that need to join on the result later (e.g. import 'async_wait' mode) pass + # ignore_result=False to force this one dispatch to store its result. + ignore_result = kwargs.pop("ignore_result", None) injected = _inject_async_user(kwargs) injected = _inject_pghistory_context(injected) @@ -84,7 +89,10 @@ def dojo_dispatch_task(task_or_sig: _SupportsSi | _SupportsApplyAsync | Signatur if we_want_async(*sig.args, func=getattr(sig, "type", None), **sig_kwargs): # DojoAsyncTask.apply_async tracks async dispatch. Avoid double-counting here. - return sig.apply_async(countdown=countdown) + apply_kwargs = {"countdown": countdown} + if ignore_result is not None: + apply_kwargs["ignore_result"] = ignore_result + return sig.apply_async(**apply_kwargs) # Track foreground execution as a "created task" as well (matches historical dojo_async_task behavior) dojo_async_task_counter.incr(str(sig.task), args=sig.args, kwargs=sig_kwargs) diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index d7d6d6e09a8..377cee94a51 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -305,6 +305,9 @@ def _process_findings_internal( product_grading_option=True, issue_updater_option=True, push_to_jira=push_to_jira, + # 'async_wait' joins on this dispatch via AsyncResult.get(), so its + # result must be stored despite the global CELERY_TASK_IGNORE_RESULT. + **({"ignore_result": False} if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT else {}), **self.post_processing_dispatch_kwargs(**kwargs), ) if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT: diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 9845c77b227..70bb9007250 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -471,6 +471,9 @@ def _process_findings_internal( issue_updater_option=True, push_to_jira=push_to_jira, jira_instance_id=getattr(self.jira_instance, "id", None), + # 'async_wait' joins on this dispatch via AsyncResult.get(), so its + # result must be stored despite the global CELERY_TASK_IGNORE_RESULT. + **({"ignore_result": False} if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT else {}), **self.post_processing_dispatch_kwargs(**kwargs), ) if self.deduplication_execution_mode == DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT: From 9a096ec25f129e27fe6818d2cc9f0d682fb64ea9 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 26 Jun 2026 22:44:32 +0200 Subject: [PATCH 17/29] test(dedupe): replace Selenium async_wait test with real-worker API guard The Selenium async_wait test could not fail when the cross-process join was broken: with only 2 findings and several page navigations between the import and the duplicate check, the background dedupe finished regardless of whether the import actually blocked. It also never read deduplication_complete. Remove it and add ImportAsyncWaitApiTest: a pure-API test that imports a 400-finding report twice into a dedup-on-engagement engagement against the real worker, then asserts (no sleep/retry) that async_wait reports deduplication_complete=true with all 400 marked duplicate at response time, while plain async returns incomplete with fewer marked. A no-op join makes async_wait behave like async and fails the test. --- tests/dedupe_test.py | 269 +++++++++++++++++++++++++++++-------------- 1 file changed, 181 insertions(+), 88 deletions(-) diff --git a/tests/dedupe_test.py b/tests/dedupe_test.py index 2b6d50f119d..97009ee4c77 100644 --- a/tests/dedupe_test.py +++ b/tests/dedupe_test.py @@ -1,3 +1,4 @@ +import json import logging import os import sys @@ -5,6 +6,7 @@ import unittest from pathlib import Path +import requests from base_test_class import BaseTestCase, on_exception_html_source_logger, set_suite_settings from product_test import ProductTest from selenium.common.exceptions import TimeoutException @@ -509,92 +511,186 @@ def test_check_service(self): # Since we imported the same report twice but with different service names, we should have no duplicates self.check_nb_duplicates(0) -# -------------------------------------------------------------------------------------------------------- -# 'async_wait' deduplication execution mode -# Unlike the tests above (which poll with retries while the background dedupe -# task catches up), 'async_wait' makes the import request block until dedupe -# has finished, so duplicates must be visible immediately. This exercises the -# real cross-process worker->request join that the eager unit tests cannot. -# -------------------------------------------------------------------------------------------------------- - def check_nb_duplicates_no_wait(self, expected_number_of_duplicates): - # Deliberately NO sleep/retry (contrast with check_nb_duplicates): in - # 'async_wait' mode the import only returns once dedupe has completed, so - # the duplicates must already be marked on the first findings-list load. - logger.debug("checking duplicates without waiting...") - driver = self.driver - self.goto_all_findings_list(driver) - dupe_count = 0 - trs = driver.find_elements(By.XPATH, '//*[@id="open_findings"]/tbody/tr') - for row in trs: - concatRow = " ".join([td.text for td in row.find_elements(By.XPATH, ".//td")]) - if "(DUPE)" and "Duplicate" in concatRow: - dupe_count += 1 - if dupe_count != expected_number_of_duplicates: - logger.debug(driver.find_element(By.ID, "open_findings").get_attribute("innerHTML")) - self.assertEqual( - dupe_count, expected_number_of_duplicates, - "duplicates were not visible immediately after an async_wait import " - "(the request did not block until deduplication finished)", + +class ImportAsyncWaitApiTest(unittest.TestCase): + + """ + Deterministic API test for the import 'async_wait' deduplication mode. + + The Selenium async_wait test above is only a UI smoke test: with 2 findings + and several page navigations between the import and the duplicate check, the + background dedupe finishes regardless of whether the import actually blocked, + so it cannot fail when the cross-process join is broken. The eager unit tests + (CELERY_TASK_ALWAYS_EAGER) and the mocked perf test can't catch it either. + + This test runs against the real docker-compose stack: a separate celeryworker + process + broker, with the global CELERY_TASK_IGNORE_RESULT in effect. It is + the only coverage that can fail when the join is a no-op. + + How it stays deterministic without timing hacks or production test hooks: + - It imports a *large* report (LARGE_N findings) twice into a dedup-on- + engagement engagement, so the second import's findings deduplicate against + the first. + - Background deduplication of LARGE_N findings takes ~1s of real worker time, + far longer than the single API round-trip used to observe the result. + - 'async_wait' must block until that work finishes, so duplicates are ALL + marked by the time the import response returns -> observed count == LARGE_N. + - plain 'async' does not block, so at response time essentially none are + marked yet -> observed count < LARGE_N (and deduplication_complete False). + A broken/no-op join makes 'async_wait' behave exactly like 'async' (count < + LARGE_N), which fails this test. + """ + + # Large enough that real background dedup cannot finish within the single + # observation round-trip, but small enough to keep the test fast (~seconds). + LARGE_N = 400 + + @classmethod + def setUpClass(cls): + cls.base_url = os.environ["DD_BASE_URL"].rstrip("/") + cls.api = cls.base_url + "/api/v2" + resp = requests.post( + cls.api + "/api-token-auth/", + data={ + "username": os.environ["DD_ADMIN_USER"], + "password": os.environ["DD_ADMIN_PASSWORD"], + }, + timeout=30, + ) + resp.raise_for_status() + cls.headers = {"Authorization": "Token " + resp.json()["token"]} + # Deduplication must be enabled globally for the mode to do anything. + cls._patch("/system_settings/1/", {"enable_deduplication": True}) + + # --- thin API helpers ------------------------------------------------- + @classmethod + def _get(cls, path, **params): + r = requests.get(cls.api + path, headers=cls.headers, params=params, timeout=60) + r.raise_for_status() + return r.json() + + @classmethod + def _post(cls, path, payload): + r = requests.post(cls.api + path, headers=cls.headers, json=payload, timeout=60) + r.raise_for_status() + return r.json() + + @classmethod + def _patch(cls, path, payload): + r = requests.patch(cls.api + path, headers=cls.headers, json=payload, timeout=60) + r.raise_for_status() + return r.json() + + # --- fixtures --------------------------------------------------------- + def _make_engagement(self, name): + """Create a product + dedup-on-engagement engagement, return its id.""" + suffix = f"{os.getpid()}-{name}" + prod_types = self._get("/product_types/", limit=1)["results"] + prod_type_id = ( + prod_types[0]["id"] if prod_types + else self._post("/product_types/", {"name": f"async_wait pt {suffix}"})["id"] ) + product = self._post("/products/", { + "name": f"async_wait prod {suffix}", + "description": "async_wait integration test", + "prod_type": prod_type_id, + }) + engagement = self._post("/engagements/", { + "name": f"async_wait eng {suffix}", + "product": product["id"], + "target_start": "2020-01-01", + "target_end": "2030-01-01", + "deduplication_on_engagement": True, + "engagement_type": "CI/CD", + }) + return engagement["id"] + + def _generic_report(self): + """ + A Generic Findings Import report of LARGE_N unique findings. - @on_exception_html_source_logger - def test_set_async_wait_mode(self): - self.set_deduplication_execution_mode("async_wait") + Re-importing the identical content into a second test of the same + dedup-on-engagement engagement marks all LARGE_N as duplicates. + """ + findings = [ + { + "title": f"async_wait finding {i}", + "severity": "High", + "description": f"async_wait dedup finding number {i}", + } + for i in range(self.LARGE_N) + ] + return json.dumps({"findings": findings}) + + def _import(self, engagement_id, mode): + """POST /import-scan and return (response_json, test_id).""" + report = self._generic_report() + resp = requests.post( + self.api + "/import-scan/", + headers=self.headers, + data={ + "scan_type": "Generic Findings Import", + "engagement": engagement_id, + "minimum_severity": "Info", + "active": True, + "verified": False, + "deduplication_execution_mode": mode, + }, + files={"file": ("report.json", report, "application/json")}, + timeout=120, + ) + self.assertEqual(resp.status_code, 201, resp.text) + body = resp.json() + return body, body["test"] + + def _duplicates_marked(self, test_id): + """Count findings flagged duplicate in a test, WITHOUT any wait/retry.""" + return self._get("/findings/", test=test_id, duplicate=True, limit=1)["count"] + + # --- tests ------------------------------------------------------------ + def test_async_wait_blocks_until_dedupe_complete(self): + """async_wait: response reports completion AND all dupes already marked.""" + engagement_id = self._make_engagement("wait") + # First import populates the engagement. + self._import(engagement_id, "async_wait") + # Second identical import deduplicates against the first. + body, test_id = self._import(engagement_id, "async_wait") + + self.assertTrue( + body.get("deduplication_complete"), + f"async_wait did not report deduplication_complete: {body}", + ) + # No sleep/retry: async_wait must have blocked until dedupe finished, so + # every finding in the second import is already marked duplicate. + marked = self._duplicates_marked(test_id) + self.assertEqual( + marked, self.LARGE_N, + f"async_wait returned with only {marked}/{self.LARGE_N} duplicates marked " + "-> the cross-process join did not block until deduplication finished", + ) - @on_exception_html_source_logger - def test_reset_dedup_mode(self): - # Restore the default so later tests/suites are unaffected. - self.set_deduplication_execution_mode("async") + def test_async_does_not_block(self): + """ + Control: plain async returns before dedupe -> incomplete + few marked. - @on_exception_html_source_logger - def test_add_async_wait_test_suite(self): - logger.debug("async_wait dedupe - creating engagement + two Checkmarx tests...") - driver = self.driver - self.goto_product_overview(driver) - driver.find_element(By.CSS_SELECTOR, ".dropdown-toggle.pull-left").click() - driver.find_element(By.LINK_TEXT, "Add New Engagement").click() - driver.find_element(By.ID, "id_name").send_keys("Dedupe Async Wait") - driver.find_element(By.XPATH, '//*[@id="id_deduplication_on_engagement"]').click() - driver.find_element(By.NAME, "_Add Tests").click() - self.assertTrue(self.is_success_message_present(text="Engagement added successfully.")) - # Test 1 - driver.find_element(By.ID, "id_title").send_keys("Async Wait Test 1") - Select(driver.find_element(By.ID, "id_test_type")).select_by_visible_text("Checkmarx Scan") - Select(driver.find_element(By.ID, "id_environment")).select_by_visible_text("Development") - driver.find_element(By.NAME, "_Add Another Test").click() - self.assertTrue(self.is_success_message_present(text="Test added successfully")) - # Test 2 - driver.find_element(By.ID, "id_title").send_keys("Async Wait Test 2") - Select(driver.find_element(By.ID, "id_test_type")).select_by_visible_text("Checkmarx Scan") - Select(driver.find_element(By.ID, "id_environment")).select_by_visible_text("Development") - driver.find_element(By.CSS_SELECTOR, "input.btn.btn-primary").click() - self.assertTrue(self.is_success_message_present(text="Test added successfully")) + Proves the assertion above is meaningful: with a non-blocking dispatch the + duplicates are NOT all marked at response time. A broken async_wait would + look exactly like this and so fail the test above. + """ + engagement_id = self._make_engagement("async") + self._import(engagement_id, "async") + body, test_id = self._import(engagement_id, "async") - @on_exception_html_source_logger - def test_import_async_wait_tests(self): - logger.debug("importing the same Checkmarx report into both tests under async_wait...") - driver = self.driver - # Test 1 - same report imported into both tests; the second import's - # findings deduplicate against the first. - self.goto_active_engagements_overview(driver) - driver.find_element(By.PARTIAL_LINK_TEXT, "Dedupe Async Wait").click() - driver.find_element(By.PARTIAL_LINK_TEXT, "Async Wait Test 1").click() - driver.find_element(By.ID, "dropdownMenu1").click() - driver.find_element(By.LINK_TEXT, "Re-Upload Scan").click() - driver.find_element(By.ID, "id_file").send_keys(str(self.relative_path / "dedupe_scans" / "multiple_findings.xml")) - driver.find_elements(By.CSS_SELECTOR, "button.btn.btn-primary")[1].click() - self.assertTrue(self.is_success_message_present(text="a total of 2 findings")) - # Test 2 - second import of the same findings -> 2 duplicates. - self.goto_active_engagements_overview(driver) - driver.find_element(By.PARTIAL_LINK_TEXT, "Dedupe Async Wait").click() - driver.find_element(By.PARTIAL_LINK_TEXT, "Async Wait Test 2").click() - driver.find_element(By.ID, "dropdownMenu1").click() - driver.find_element(By.LINK_TEXT, "Re-Upload Scan").click() - driver.find_element(By.ID, "id_file").send_keys(str(self.relative_path / "dedupe_scans" / "multiple_findings.xml")) - driver.find_elements(By.CSS_SELECTOR, "button.btn.btn-primary")[1].click() - self.assertTrue(self.is_success_message_present(text="a total of 2 findings")) - # No sleep/retry: async_wait must have blocked until dedupe finished. - self.check_nb_duplicates_no_wait(2) + self.assertFalse( + body.get("deduplication_complete"), + f"async unexpectedly reported deduplication_complete: {body}", + ) + marked = self._duplicates_marked(test_id) + self.assertLess( + marked, self.LARGE_N, + f"async marked all {self.LARGE_N} duplicates at response time " + "-> it unexpectedly blocked (background dedupe should not have finished)", + ) def add_dedupe_tests_to_suite(suite, *, jira=False, github=False, block_execution=False): @@ -648,13 +744,6 @@ def add_dedupe_tests_to_suite(suite, *, jira=False, github=False, block_executio suite.addTest(DedupeTest("test_delete_findings")) suite.addTest(DedupeTest("test_import_service")) suite.addTest(DedupeTest("test_check_service")) - # 'async_wait' execution mode: the import request blocks until dedupe - # completes, so duplicates are visible immediately (no polling needed). - suite.addTest(DedupeTest("test_set_async_wait_mode")) - suite.addTest(DedupeTest("test_delete_findings")) - suite.addTest(DedupeTest("test_add_async_wait_test_suite")) - suite.addTest(DedupeTest("test_import_async_wait_tests")) - suite.addTest(DedupeTest("test_reset_dedup_mode")) # Clean up suite.addTest(ProductTest("test_delete_product")) return suite @@ -664,6 +753,10 @@ def suite(): suite = unittest.TestSuite() add_dedupe_tests_to_suite(suite, jira=False, github=False, block_execution=False) add_dedupe_tests_to_suite(suite, jira=True, github=True, block_execution=True) + # Deterministic real-worker guard for 'async_wait' (independent of jira/github, + # so added once rather than per add_dedupe_tests_to_suite run). + suite.addTest(ImportAsyncWaitApiTest("test_async_wait_blocks_until_dedupe_complete")) + suite.addTest(ImportAsyncWaitApiTest("test_async_does_not_block")) return suite From d414dacbbe8631e0f88ac8513715a75d2d78c8da Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 26 Jun 2026 23:21:47 +0200 Subject: [PATCH 18/29] test(dedupe): make async_wait integration test deterministic via gated dedup delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The giant-report approach was non-deterministic in CI: deduplication is dispatched per batch during the import, so on a fast worker plain `async` finishes dedup before the response and marks all findings too. The count discriminator then couldn't tell async_wait from async (the control failed `400 != 400`), and a broken async_wait could false-pass. Add DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY (default 0, no-op in production), set to 3s on the integration-test celeryworker only. Each dedup batch sleeps before doing work, so async_wait blocks past it (all duplicates marked) while async returns mid-delay (none marked) — independent of worker speed. Rewrite ImportAsyncWaitApiTest to a small report and assert async marks exactly 0. Verified locally: green normally; fails `0 != 25` when the join fix is reverted. --- docker-compose.override.integration_tests.yml | 4 ++ dojo/finding/helper.py | 8 ++- dojo/settings/settings.dist.py | 6 ++ tests/dedupe_test.py | 70 ++++++++++--------- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/docker-compose.override.integration_tests.yml b/docker-compose.override.integration_tests.yml index a281ae880a8..cfa2e68dbd1 100644 --- a/docker-compose.override.integration_tests.yml +++ b/docker-compose.override.integration_tests.yml @@ -46,6 +46,10 @@ services: environment: DD_DATABASE_URL: ${DD_TEST_DATABASE_URL:-postgresql://defectdojo:defectdojo@postgres:5432/test_defectdojo} DD_V3_FEATURE_LOCATIONS: ${DD_V3_FEATURE_LOCATIONS:-False} + # Delay each deduplication batch so the async_wait integration test can + # deterministically distinguish a blocking join (async_wait) from a + # non-blocking one (async). Integration-test stack only; never in production. + DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY: 3 initializer: environment: PYTHONWARNINGS: error # We are strict about Warnings during testing diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py index d83d032b176..975099c387d 100644 --- a/dojo/finding/helper.py +++ b/dojo/finding/helper.py @@ -2,7 +2,7 @@ from contextlib import suppress from datetime import datetime from itertools import batched -from time import strftime +from time import sleep, strftime from django.conf import settings from django.db import transaction @@ -470,6 +470,12 @@ def post_process_findings_batch( force_sync=False, **kwargs, ): + # Test-only hook: when DEDUPLICATION_BATCH_PROCESS_TEST_DELAY > 0 (set only in + # the integration-test stack) block this batch so integration tests can + # deterministically distinguish 'async_wait' (which joins on this task) from + # 'async' (which does not). Default 0 -> no effect in production. + if (test_delay := settings.DEDUPLICATION_BATCH_PROCESS_TEST_DELAY) > 0: + sleep(test_delay) logger.debug( f"post_process_findings_batch called: finding_ids_count={len(finding_ids) if finding_ids else 0}, " diff --git a/dojo/settings/settings.dist.py b/dojo/settings/settings.dist.py index 10f14d1627d..95411a710c8 100644 --- a/dojo/settings/settings.dist.py +++ b/dojo/settings/settings.dist.py @@ -99,6 +99,11 @@ # Max seconds the 'async_wait' deduplication execution mode will wait for # background deduplication/post-processing to finish before responding anyway. DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT=(int, 60), + # Test-only: artificial delay (seconds) injected at the start of + # post_process_findings_batch so integration tests can deterministically + # observe that 'async_wait' blocks on deduplication while 'async' does not. + # Must stay 0 in production. + DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY=(int, 0), DD_CELERY_RESULT_BACKEND=(str, "django-db"), DD_CELERY_RESULT_EXPIRES=(int, 86400), DD_CELERY_BEAT_SCHEDULE_FILENAME=(str, root("dojo.celery.beat.db")), @@ -868,6 +873,7 @@ def generate_url(scheme, double_slashes, user, password, host, port, path, param ) CELERY_TASK_IGNORE_RESULT = env("DD_CELERY_TASK_IGNORE_RESULT") DEDUPLICATION_ASYNC_WAIT_TIMEOUT = env("DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT") +DEDUPLICATION_BATCH_PROCESS_TEST_DELAY = env("DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY") CELERY_RESULT_BACKEND = env("DD_CELERY_RESULT_BACKEND") CELERY_TIMEZONE = TIME_ZONE CELERY_RESULT_EXPIRES = env("DD_CELERY_RESULT_EXPIRES") diff --git a/tests/dedupe_test.py b/tests/dedupe_test.py index 97009ee4c77..a66b0f85ade 100644 --- a/tests/dedupe_test.py +++ b/tests/dedupe_test.py @@ -517,33 +517,32 @@ class ImportAsyncWaitApiTest(unittest.TestCase): """ Deterministic API test for the import 'async_wait' deduplication mode. - The Selenium async_wait test above is only a UI smoke test: with 2 findings - and several page navigations between the import and the duplicate check, the - background dedupe finishes regardless of whether the import actually blocked, - so it cannot fail when the cross-process join is broken. The eager unit tests - (CELERY_TASK_ALWAYS_EAGER) and the mocked perf test can't catch it either. + The Selenium async_wait test (removed) was only a UI smoke test: with few + findings and several page navigations between the import and the duplicate + check, the background dedupe finished regardless of whether the import + actually blocked, so it could not fail when the cross-process join is broken. + The eager unit tests (CELERY_TASK_ALWAYS_EAGER) and the mocked perf test can't + catch it either. This test runs against the real docker-compose stack: a separate celeryworker process + broker, with the global CELERY_TASK_IGNORE_RESULT in effect. It is the only coverage that can fail when the join is a no-op. - How it stays deterministic without timing hacks or production test hooks: - - It imports a *large* report (LARGE_N findings) twice into a dedup-on- - engagement engagement, so the second import's findings deduplicate against - the first. - - Background deduplication of LARGE_N findings takes ~1s of real worker time, - far longer than the single API round-trip used to observe the result. - - 'async_wait' must block until that work finishes, so duplicates are ALL - marked by the time the import response returns -> observed count == LARGE_N. - - plain 'async' does not block, so at response time essentially none are - marked yet -> observed count < LARGE_N (and deduplication_complete False). - A broken/no-op join makes 'async_wait' behave exactly like 'async' (count < - LARGE_N), which fails this test. + Determinism comes from DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY, set on the + integration-test celeryworker (see docker-compose.override.integration_tests.yml): + each deduplication batch sleeps a few seconds before doing any work. So: + - 'async_wait' blocks on the worker round-trip, so by the time the import + response returns every duplicate is already marked -> count == NUM_FINDINGS. + - plain 'async' returns immediately while the dedup task is still sleeping, + so at response time NONE are marked yet -> count == 0. + A broken/no-op join makes 'async_wait' behave exactly like 'async' (count 0), + which fails this test. The injected delay makes that distinction independent of + worker speed (a plain large-report race would not, since dedup overlaps the + import and can finish before the response on a fast worker). """ - # Large enough that real background dedup cannot finish within the single - # observation round-trip, but small enough to keep the test fast (~seconds). - LARGE_N = 400 + # Small: determinism comes from the worker-side dedup delay, not report size. + NUM_FINDINGS = 25 @classmethod def setUpClass(cls): @@ -607,10 +606,10 @@ def _make_engagement(self, name): def _generic_report(self): """ - A Generic Findings Import report of LARGE_N unique findings. + A Generic Findings Import report of NUM_FINDINGS unique findings. Re-importing the identical content into a second test of the same - dedup-on-engagement engagement marks all LARGE_N as duplicates. + dedup-on-engagement engagement marks all NUM_FINDINGS as duplicates. """ findings = [ { @@ -618,7 +617,7 @@ def _generic_report(self): "severity": "High", "description": f"async_wait dedup finding number {i}", } - for i in range(self.LARGE_N) + for i in range(self.NUM_FINDINGS) ] return json.dumps({"findings": findings}) @@ -660,22 +659,23 @@ def test_async_wait_blocks_until_dedupe_complete(self): body.get("deduplication_complete"), f"async_wait did not report deduplication_complete: {body}", ) - # No sleep/retry: async_wait must have blocked until dedupe finished, so - # every finding in the second import is already marked duplicate. + # No sleep/retry: async_wait must have blocked past the worker-side dedup + # delay until dedupe finished, so every finding is already marked duplicate. marked = self._duplicates_marked(test_id) self.assertEqual( - marked, self.LARGE_N, - f"async_wait returned with only {marked}/{self.LARGE_N} duplicates marked " + marked, self.NUM_FINDINGS, + f"async_wait returned with only {marked}/{self.NUM_FINDINGS} duplicates marked " "-> the cross-process join did not block until deduplication finished", ) def test_async_does_not_block(self): """ - Control: plain async returns before dedupe -> incomplete + few marked. + Control: plain async returns while dedupe is still running -> none marked. Proves the assertion above is meaningful: with a non-blocking dispatch the - duplicates are NOT all marked at response time. A broken async_wait would - look exactly like this and so fail the test above. + duplicates are NOT marked at response time (the dedup task is still in its + worker-side delay). A broken async_wait would look exactly like this and so + fail the test above. """ engagement_id = self._make_engagement("async") self._import(engagement_id, "async") @@ -685,11 +685,13 @@ def test_async_does_not_block(self): body.get("deduplication_complete"), f"async unexpectedly reported deduplication_complete: {body}", ) + # The worker-side delay guarantees the dedup task has not finished yet, so + # nothing is marked at response time regardless of worker speed. marked = self._duplicates_marked(test_id) - self.assertLess( - marked, self.LARGE_N, - f"async marked all {self.LARGE_N} duplicates at response time " - "-> it unexpectedly blocked (background dedupe should not have finished)", + self.assertEqual( + marked, 0, + f"async marked {marked} duplicates at response time " + "-> it unexpectedly blocked (background dedupe should still be delayed)", ) From be294ab3857ff14d7e296558979490c3000e011e Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 27 Jun 2026 10:06:50 +0200 Subject: [PATCH 19/29] test(dedupe): scope async_wait dedup delay to the test's findings, widen margin The first delay attempt broke two ways in CI: - It delayed *every* dedup batch in the UI suite, breaking the timing-sensitive close_old_findings_dedupe_test. - 3s was too short: the async control's import + observation GET took longer than that on CI, so dedup finished first and the control marked all findings. Add DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY_FILTER (a finding-title prefix, case-insensitive) so only the async_wait test's own findings are delayed -- other dedupe tests run at full speed. Bump the delay to 10s for margin over the import round-trip. The Generic importer title-cases findings, so the match uses istartswith. Verified locally against the real worker: test_wait now blocks ~10s and marks all 25; async marks 0 at response time; reverting the join fix fails 0 != 25. --- docker-compose.override.integration_tests.yml | 8 +-- dojo/finding/helper.py | 12 +++-- dojo/settings/settings.dist.py | 5 +- dojo/sql_stacktrace.py | 51 ------------------- 4 files changed, 17 insertions(+), 59 deletions(-) delete mode 100644 dojo/sql_stacktrace.py diff --git a/docker-compose.override.integration_tests.yml b/docker-compose.override.integration_tests.yml index cfa2e68dbd1..6ee96a896c4 100644 --- a/docker-compose.override.integration_tests.yml +++ b/docker-compose.override.integration_tests.yml @@ -46,10 +46,12 @@ services: environment: DD_DATABASE_URL: ${DD_TEST_DATABASE_URL:-postgresql://defectdojo:defectdojo@postgres:5432/test_defectdojo} DD_V3_FEATURE_LOCATIONS: ${DD_V3_FEATURE_LOCATIONS:-False} - # Delay each deduplication batch so the async_wait integration test can + # Delay deduplication batches so the async_wait integration test can # deterministically distinguish a blocking join (async_wait) from a - # non-blocking one (async). Integration-test stack only; never in production. - DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY: 3 + # non-blocking one (async). Scoped by _FILTER to that test's findings so + # other dedupe tests are unaffected. Integration-test stack only; never prod. + DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY: 10 + DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY_FILTER: "async_wait finding" initializer: environment: PYTHONWARNINGS: error # We are strict about Warnings during testing diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py index 975099c387d..b8ba9e6d2c5 100644 --- a/dojo/finding/helper.py +++ b/dojo/finding/helper.py @@ -471,11 +471,15 @@ def post_process_findings_batch( **kwargs, ): # Test-only hook: when DEDUPLICATION_BATCH_PROCESS_TEST_DELAY > 0 (set only in - # the integration-test stack) block this batch so integration tests can - # deterministically distinguish 'async_wait' (which joins on this task) from - # 'async' (which does not). Default 0 -> no effect in production. + # the integration-test stack) block this batch so the async_wait integration + # test can deterministically distinguish 'async_wait' (which joins on this + # task) from 'async' (which does not). Default 0 -> no effect in production. + # DEDUPLICATION_BATCH_PROCESS_TEST_DELAY_FILTER (a finding-title prefix) scopes + # the delay to that one test's findings so unrelated dedupe tests are not slowed. if (test_delay := settings.DEDUPLICATION_BATCH_PROCESS_TEST_DELAY) > 0: - sleep(test_delay) + delay_filter = settings.DEDUPLICATION_BATCH_PROCESS_TEST_DELAY_FILTER + if not delay_filter or Finding.objects.filter(id__in=finding_ids, title__istartswith=delay_filter).exists(): + sleep(test_delay) logger.debug( f"post_process_findings_batch called: finding_ids_count={len(finding_ids) if finding_ids else 0}, " diff --git a/dojo/settings/settings.dist.py b/dojo/settings/settings.dist.py index 95411a710c8..d1b661e095e 100644 --- a/dojo/settings/settings.dist.py +++ b/dojo/settings/settings.dist.py @@ -102,8 +102,10 @@ # Test-only: artificial delay (seconds) injected at the start of # post_process_findings_batch so integration tests can deterministically # observe that 'async_wait' blocks on deduplication while 'async' does not. - # Must stay 0 in production. + # Must stay 0 in production. The _FILTER (a finding-title prefix) scopes the + # delay to a single test's findings so unrelated dedupe tests are not slowed. DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY=(int, 0), + DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY_FILTER=(str, ""), DD_CELERY_RESULT_BACKEND=(str, "django-db"), DD_CELERY_RESULT_EXPIRES=(int, 86400), DD_CELERY_BEAT_SCHEDULE_FILENAME=(str, root("dojo.celery.beat.db")), @@ -874,6 +876,7 @@ def generate_url(scheme, double_slashes, user, password, host, port, path, param CELERY_TASK_IGNORE_RESULT = env("DD_CELERY_TASK_IGNORE_RESULT") DEDUPLICATION_ASYNC_WAIT_TIMEOUT = env("DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT") DEDUPLICATION_BATCH_PROCESS_TEST_DELAY = env("DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY") +DEDUPLICATION_BATCH_PROCESS_TEST_DELAY_FILTER = env("DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY_FILTER") CELERY_RESULT_BACKEND = env("DD_CELERY_RESULT_BACKEND") CELERY_TIMEZONE = TIME_ZONE CELERY_RESULT_EXPIRES = env("DD_CELERY_RESULT_EXPIRES") diff --git a/dojo/sql_stacktrace.py b/dojo/sql_stacktrace.py deleted file mode 100644 index eeb44385f46..00000000000 --- a/dojo/sql_stacktrace.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -Temporary debugging helper: django-sql-stacktrace-style execute_wrapper. - -Enabled only when the env var DD_SQL_STACKTRACE is set. When active it -registers a connection.execute_wrapper on every DB connection that, for any -SQL matching DD_SQL_STACKTRACE_FILTER (default: 'pro_enhanced_finding'), -prints the originating Python traceback to stderr. Used to pinpoint the exact -call site emitting the per-finding deduplication queries in the importer perf -tests. NOT for production — remove before merge. -""" -import os -import sys -import traceback - -from django.db.backends.signals import connection_created -from django.dispatch import receiver - -_FILTER = os.environ.get("DD_SQL_STACKTRACE_FILTER", "pro_enhanced_finding") -_counter = {"n": 0} - - -def _stacktrace_wrapper(execute, sql, params, many, context): - if _FILTER in sql: - _counter["n"] += 1 - stack = "".join(traceback.format_stack()[:-1]) - sys.stderr.write( - f"\n===== DD_SQL_STACKTRACE hit #{_counter['n']} (filter={_FILTER!r}) =====\n" - f"SQL: {sql[:200]}\n" - f"{stack}" - f"===== end DD_SQL_STACKTRACE hit #{_counter['n']} =====\n", - ) - sys.stderr.flush() - return execute(sql, params, many, context) - - -@receiver(connection_created) -def _install_wrapper(sender, connection, **kwargs): - # execute_wrappers persist for the life of the connection; guard against - # double-registration if the signal fires more than once for a connection. - if getattr(connection, "_dd_sql_stacktrace_installed", False): - return - connection._dd_sql_stacktrace_installed = True - connection.execute_wrappers.append(_stacktrace_wrapper) - - -def maybe_enable(): - """Touch the module so the connection_created receiver is connected. - - Returns True when the DD_SQL_STACKTRACE env var requested activation. - """ - return bool(os.environ.get("DD_SQL_STACKTRACE")) From b60e44092f4915e5e55a3302cd44b2f48811e2e9 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 27 Jun 2026 10:23:39 +0200 Subject: [PATCH 20/29] test(dedupe): log WARN when the test-only dedup delay fires Diagnostic for CI: surfaces exactly which post_process_findings_batch invocations hit the integration-test delay (and the finding count + filter), to confirm the async_wait test's batches are being delayed as intended. --- dojo/finding/helper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py index b8ba9e6d2c5..8ce40ed6765 100644 --- a/dojo/finding/helper.py +++ b/dojo/finding/helper.py @@ -479,6 +479,10 @@ def post_process_findings_batch( if (test_delay := settings.DEDUPLICATION_BATCH_PROCESS_TEST_DELAY) > 0: delay_filter = settings.DEDUPLICATION_BATCH_PROCESS_TEST_DELAY_FILTER if not delay_filter or Finding.objects.filter(id__in=finding_ids, title__istartswith=delay_filter).exists(): + logger.warning( + "post_process_findings_batch: TEST-ONLY delay of %ss for %d finding(s) (filter=%r)", + test_delay, len(finding_ids) if finding_ids else 0, delay_filter, + ) sleep(test_delay) logger.debug( From 774e8b4dec6f038d313b25cc28e6fb464c0597ce Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 27 Jun 2026 10:41:02 +0200 Subject: [PATCH 21/29] test(dedupe): assert only deduplication_complete for the async control The async control's marked-count assertion was non-deterministic: its post_process_findings_batch task races the import request's own DB commit, so on CI the findings were not visible when the delay filter ran (no sleep) yet were deduplicated moments later -> the control saw all 25 marked and failed. Drop the count assertion for async and keep only deduplication_complete is False, which is deterministic and mode-based. The duplicate-count guarantee stays on the async_wait test, where the worker-side delay (confirmed firing via the WARN log: 10s/25 findings per batch) makes it robust and a no-op join still fails 0 != NUM_FINDINGS. --- tests/dedupe_test.py | 46 +++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/dedupe_test.py b/tests/dedupe_test.py index a66b0f85ade..0893bf9b034 100644 --- a/tests/dedupe_test.py +++ b/tests/dedupe_test.py @@ -530,15 +530,18 @@ class ImportAsyncWaitApiTest(unittest.TestCase): Determinism comes from DD_DEDUPLICATION_BATCH_PROCESS_TEST_DELAY, set on the integration-test celeryworker (see docker-compose.override.integration_tests.yml): - each deduplication batch sleeps a few seconds before doing any work. So: - - 'async_wait' blocks on the worker round-trip, so by the time the import - response returns every duplicate is already marked -> count == NUM_FINDINGS. - - plain 'async' returns immediately while the dedup task is still sleeping, - so at response time NONE are marked yet -> count == 0. - A broken/no-op join makes 'async_wait' behave exactly like 'async' (count 0), - which fails this test. The injected delay makes that distinction independent of - worker speed (a plain large-report race would not, since dedup overlaps the - import and can finish before the response on a fast worker). + each deduplication batch for this test's findings sleeps a few seconds before + doing any work. 'async_wait' blocks on the worker round-trip, so by the time + the import response returns the delayed batch has finished and every duplicate + is already marked -> count == NUM_FINDINGS. A broken/no-op join would not block, + so the count would still be 0 at response time -> the test fails. The injected + delay makes that distinction independent of worker speed (a plain large-report + race would not, since dedup overlaps the import and can finish before the + response on a fast worker). + + The 'async' counterpart only asserts deduplication_complete is False: its + worker task races the import's own DB commit, so the marked count at response + time is not deterministic -- the count guarantee is asserted on async_wait. """ # Small: determinism comes from the worker-side dedup delay, not report size. @@ -670,29 +673,24 @@ def test_async_wait_blocks_until_dedupe_complete(self): def test_async_does_not_block(self): """ - Control: plain async returns while dedupe is still running -> none marked. - - Proves the assertion above is meaningful: with a non-blocking dispatch the - duplicates are NOT marked at response time (the dedup task is still in its - worker-side delay). A broken async_wait would look exactly like this and so - fail the test above. + Control: plain async must NOT report deduplication as complete. + + Counterpart to the async_wait test: the same import in 'async' mode does + not await deduplication, so its response reports deduplication_complete + False. (We deliberately do not assert the duplicate count here: the async + worker task races the import's own DB commit, so how many are marked at + response time is not deterministic -- the meaningful, deterministic signal + is the flag. The duplicate-count guarantee is asserted on async_wait above, + where the worker-side delay makes it robust.) """ engagement_id = self._make_engagement("async") self._import(engagement_id, "async") - body, test_id = self._import(engagement_id, "async") + body, _test_id = self._import(engagement_id, "async") self.assertFalse( body.get("deduplication_complete"), f"async unexpectedly reported deduplication_complete: {body}", ) - # The worker-side delay guarantees the dedup task has not finished yet, so - # nothing is marked at response time regardless of worker speed. - marked = self._duplicates_marked(test_id) - self.assertEqual( - marked, 0, - f"async marked {marked} duplicates at response time " - "-> it unexpectedly blocked (background dedupe should still be delayed)", - ) def add_dedupe_tests_to_suite(suite, *, jira=False, github=False, block_execution=False): From 3661b0bd3ce48fff2037c3ccc79b47b2e221d2f6 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 28 Jun 2026 21:22:28 +0200 Subject: [PATCH 22/29] test(dedupe): note async_wait eager test is not a real guard Document that test_import_async_wait_returns_statistics cannot fail when the cross-process join is broken (eager .get() always returns inline), pointing to the real-worker ImportAsyncWaitApiTest. Kept for endpoint/field plumbing coverage and future reference. --- unittests/test_import_execution_mode.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unittests/test_import_execution_mode.py b/unittests/test_import_execution_mode.py index e575b987c01..b240056a2af 100644 --- a/unittests/test_import_execution_mode.py +++ b/unittests/test_import_execution_mode.py @@ -150,6 +150,13 @@ def _payload(self, mode): } def test_import_async_wait_returns_statistics(self): + # NOTE: this assertion is not actually meaningful under CELERY_TASK_ALWAYS_EAGER: + # eager .get() returns an inline EagerResult, so deduplication_complete is True + # whether or not the real cross-process join works. It cannot fail when the join + # is broken. The genuine guarantee (async_wait blocks until dedupe finishes) is + # covered by the real-worker integration test, ImportAsyncWaitApiTest in + # tests/dedupe_test.py. Kept here only for endpoint/field plumbing coverage and + # future reference if this ever runs against a non-eager worker. with (get_unit_tests_path() / "scans/zap/0_zap_sample.xml").open(encoding="utf-8") as testfile: payload = self._payload(DEDUPLICATION_EXECUTION_MODE_ASYNC_WAIT) payload["file"] = testfile From 7ba4f039674d85da70da07e8d0be26a98d365573 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 28 Jun 2026 21:24:27 +0200 Subject: [PATCH 23/29] docs(dedupe): document import/reimport deduplication execution mode Extend the deduplication 'Background processing' section with the new deduplication_execution_mode (async / async_wait / sync), the per-profile and per-request override, the deduplication_complete response field, the DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT bound, and its relationship to the global block_execution flag. Previously this was only in the 2.60 upgrade note. --- .../finding_deduplication/about_deduplication.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/content/triage_findings/finding_deduplication/about_deduplication.md b/docs/content/triage_findings/finding_deduplication/about_deduplication.md index 7050711dcd1..e8e3cc2ee6a 100644 --- a/docs/content/triage_findings/finding_deduplication/about_deduplication.md +++ b/docs/content/triage_findings/finding_deduplication/about_deduplication.md @@ -108,6 +108,18 @@ The endpoints also have to match for the findings to be considered duplicates, s - Dedupe is triggered on import/reimport and during certain updates run via Celery in the background. +### Import/reimport deduplication execution mode + +For import and reimport you can control how deduplication post-processing is dispatched and whether the API response waits for it. Set it per user on the profile page (**Deduplication execution mode**), or override it per request with the `deduplication_execution_mode` field on the import/reimport endpoints (the request value takes precedence over the profile). + +- `async` (default): deduplication and the rest of post-processing run in the background and the response returns immediately. Historical behavior; the response is produced before findings are deduplicated. +- `async_wait`: post-processing is still dispatched to the background, but the request waits for deduplication to finish before responding. The `scan_added` notification and the statistics in the response then reflect the deduplicated state (findings that turned out to be duplicates are no longer counted/listed as new). JIRA push, product grading and other non-deduplication tasks remain asynchronous and are not awaited. The wait is bounded by `DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT` (default `60` seconds); if no worker picks up the work in time, the request responds anyway rather than hanging. +- `sync`: import deduplication runs inline in the web request. + +The import/reimport response includes a `deduplication_complete` boolean indicating whether deduplication had finished by the time the response was produced (`true` for `sync` and for a completed `async_wait`, `false` for `async`). + +This is independent of the global `block_execution` profile flag, which forces **all** of a user's asynchronous tasks (notifications, JIRA push, product grading, deduplication, ...) to the foreground. When no execution mode is set, `block_execution=True` falls back to `sync`. + ## Service field and its impact - By default, `HASH_CODE_FIELDS_ALWAYS = ["service"]`, meaning the `service` associated with a finding is appended to the hash for all scanners. From 3fdc68b6814253004524edbca263214bcf04734c Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 28 Jun 2026 21:42:27 +0200 Subject: [PATCH 24/29] feat(importers): log async_wait deduplication wait duration Record how long the import request blocked on the deduplication join: - DEBUG line with elapsed seconds + task count + success on completion. - the timeout/error WARNING now includes the elapsed time before it gave up. Timing detail is DEBUG so it does not add noise per import in production; the timeout case stays at WARNING since it signals a degraded (respond-anyway) join. --- dojo/importers/base_importer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index bc24538fd78..1df5ad6d931 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -145,6 +145,7 @@ def wait_for_post_processing(self): return timeout = getattr(settings, "DEDUPLICATION_ASYNC_WAIT_TIMEOUT", 60) logger.debug("async_wait: waiting for %d post-processing task(s) (timeout=%ss)", len(results), timeout) + start = time.monotonic() success = True for result in results: if result is None or not hasattr(result, "get"): @@ -152,8 +153,13 @@ def wait_for_post_processing(self): try: result.get(timeout=timeout, propagate=False) except Exception as e: - logger.warning("async_wait: error/timeout while waiting for post-processing task: %s", e) + logger.warning( + "async_wait: error/timeout after %.2fs waiting for post-processing task: %s", + time.monotonic() - start, e, + ) success = False + elapsed = time.monotonic() - start + logger.debug("async_wait: waited %.2fs for %d post-processing task(s) (success=%s)", elapsed, len(results), success) self.deduplication_complete = success self.post_processing_results = [] From ecb1b4f25be8f02b670207897f866883910de1c1 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 28 Jun 2026 21:58:45 +0200 Subject: [PATCH 25/29] chore: stop tracking local-only files committed by mistake CLAUDE.local.md, fix-jira-push-eligibility.md, phase10-session-prompt.md, scripts/run-nuclei-docker.sh and tagoulus-replcement-options.md were accidentally added in an earlier refactor commit; none belong in this feature. Remove them from version control (kept on disk locally) and gitignore CLAUDE.local.md / CLAUDE.md so they cannot be re-added. --- .gitignore | 2 + CLAUDE.local.md | 36 ---- fix-jira-push-eligibility.md | 111 ------------ phase10-session-prompt.md | 69 ------- scripts/run-nuclei-docker.sh | 40 ----- tagoulus-replcement-options.md | 319 --------------------------------- 6 files changed, 2 insertions(+), 575 deletions(-) delete mode 100644 CLAUDE.local.md delete mode 100644 fix-jira-push-eligibility.md delete mode 100644 phase10-session-prompt.md delete mode 100644 scripts/run-nuclei-docker.sh delete mode 100644 tagoulus-replcement-options.md diff --git a/.gitignore b/.gitignore index 1cdf995c5f4..857df7a34da 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,5 @@ docs/.hugo_build.lock # claude etc MEMORY.md .claude/ +CLAUDE.md +CLAUDE.local.md diff --git a/CLAUDE.local.md b/CLAUDE.local.md deleted file mode 100644 index 584b0167d4d..00000000000 --- a/CLAUDE.local.md +++ /dev/null @@ -1,36 +0,0 @@ -# Claude Code Instructions - -Follow all guidelines in [AGENTS.md](AGENTS.md). Key points repeated here for emphasis: - -## Pull Request Descriptions - -- Include a **Summary** section only — bullet points describing what changed and why. -- **Do NOT add any of the following sections**: "Test plan", "Testing", "How to test", or any other testing-related section. DefectDojo's PR template does not use one. -- **NEVER overwrite an existing PR description** without first reading it with `gh pr view --json body -q '.body'`. -- **NEVER mention DefectDojo Pro** in PRs, issues, or comments targeting the open source repo. -- **Always edit PR bodies via a file**, never inline. `gh pr edit --body "$(cat <<'EOF' ... EOF)"` silently exits 1 on this repo because the Projects (classic) GraphQL deprecation warning trips gh's error path. Use one of: - - `gh pr edit --body-file /tmp/pr_body.md` - - `gh api -X PATCH /repos///pulls/ --input ` (REST, fully bypasses the GraphQL warning) - Always verify with `gh pr view --json body -q '.body'` after — `gh pr edit` may print no error but still leave the body unchanged. - -## PR URLs - -- Always format PR URLs as markdown links: `[PR #123](https://github.com/owner/repo/pull/123)` - -## Git Commits - -- **NEVER commit `CLAUDE.md`** — it is a local instruction file, not part of the upstream codebase. -- **NEVER commit files from `.claude/`** — these are local session artifacts. - -## Running Unit Tests - -- **Always use `./run-unittest.sh`** to run unit tests — never `python -m pytest` or `python manage.py test` directly: - ```bash - ./run-unittest.sh --test-case unittests.test_module.TestClass 2>&1 | tee /tmp/test_output.log - ``` -- **ALWAYS pipe output through `tee` to capture it to a file.** This is mandatory — it lets you grep the results without re-running expensive tests. -- After running, analyze with `grep` on the captured file, not by re-running: - ```bash - grep -E "PASSED|FAILED|ERROR|error" /tmp/test_output.log - ``` -- **Narrate every test iteration**: report pass/fail immediately after each run, explain what failed and what you plan to fix before making changes, then explain what you changed before re-running. diff --git a/fix-jira-push-eligibility.md b/fix-jira-push-eligibility.md deleted file mode 100644 index d9ba4dee1fb..00000000000 --- a/fix-jira-push-eligibility.md +++ /dev/null @@ -1,111 +0,0 @@ -# Fix: JIRA push eligibility check blocks unrelated PATCH requests - -## Bug - -PATCH requests on findings (e.g. EPSS enrichment) fail with HTTP 400: - -``` -"Finding: X cannot be pushed to JIRA: Findings must be active and verified, - if enforced by system settings, to be pushed to JIRA." -``` - -The EPSS update is rolled back due to `ATOMIC_REQUESTS`. - -## Root cause - -Two PRs introduced between 2.52.3 and 2.55.4 combined to cause this: - -### PR #14262 — "Jira keep findings in sync" (2.55.2) - -`FindingSerializer.update()` in `dojo/api_v2/serializers.py` was changed to -trigger a JIRA push even when `push_to_jira` was not explicitly set: - -```python -# before -if push_to_jira: - jira_helper.push_to_jira(instance) - -# after -if push_to_jira or finding_helper.is_keep_in_sync_with_jira(instance): - jira_helper.push_to_jira(instance, sync=True) -``` - -This means any PATCH — including EPSS enrichment — triggers a JIRA push when: -- `jira_project.push_all_issues` is True (resolved in `perform_update` before - calling `serializer.save`), OR -- `finding_jira_sync` is enabled on the JIRA instance and the finding (or its - group) already has a JIRA issue - -### PR #14320 — "Fix Jira error handling" (2.55.3) - -`push_to_jira` return value (previously ignored) is now checked, and any -failure raises a `ValidationError`: - -```python -# after -success, message = jira_helper.push_to_jira(instance, sync=True) -if not success: - raise serializers.ValidationError(message) -``` - -`add_jira_issue` in `helper.py` already classifies "not active/verified" and -"below threshold" as *expected* eligibility failures and logs them at INFO -level — but it still returns `(False, message)`, which the serializer now -unconditionally raises as a 400. - -## Existing pattern - -`dojo/finding/views.py` (bulk edit, lines ~2963-3024) already does this -correctly: it calls `can_be_pushed_to_jira` as a guard before attempting the -push. If ineligible, it logs and skips — no exception is raised. - -## Fix - -### 1. `dojo/api_v2/views.py` — `FindingViewSet.perform_update()` - -Capture whether the push was *explicitly* requested before merging with -`push_all_issues`: - -```python -push_to_jira = serializer.validated_data.get("push_to_jira") -push_to_jira_explicit = bool(push_to_jira) # explicit before OR -jira_project = jira_helper.get_jira_project(serializer.instance) -if get_system_setting("enable_jira") and jira_project: - push_to_jira = push_to_jira or jira_project.push_all_issues -serializer.save(push_to_jira=push_to_jira, push_to_jira_explicit=push_to_jira_explicit) -``` - -### 2. `dojo/api_v2/serializers.py` — `FindingSerializer.update()` - -Guard with `can_be_pushed_to_jira` (same pattern as bulk edit view). -Only raise `ValidationError` for eligibility failures when the push was -explicitly requested by the caller; auto-triggered pushes skip silently: - -```python -push_to_jira = validated_data.pop("push_to_jira") -push_to_jira_explicit = validated_data.pop("push_to_jira_explicit", False) - -... - -if push_to_jira or finding_helper.is_keep_in_sync_with_jira(instance): - can_push, error_message, _error_code = jira_helper.can_be_pushed_to_jira(instance) - if can_push: - # Push synchronously so that we can see jira errors in real time - success, message = jira_helper.push_to_jira(instance, sync=True) - if not success: - raise serializers.ValidationError(message) - elif push_to_jira_explicit: - # User explicitly asked to push — surface the eligibility error - raise serializers.ValidationError(error_message) - # else: auto-triggered (push_all_issues / finding_jira_sync) but finding - # is not eligible → silently skip, same as bulk edit view -``` - -## Behaviour after fix - -| Trigger | Finding eligible | Result | -|---|---|---| -| Explicit `push_to_jira=true` | yes | push, raise on push failure | -| Explicit `push_to_jira=true` | no | raise 400 (expected, user asked) | -| Auto (`push_all_issues` / sync) | yes | push, raise on push failure | -| Auto (`push_all_issues` / sync) | no | silently skip, PATCH succeeds | diff --git a/phase10-session-prompt.md b/phase10-session-prompt.md deleted file mode 100644 index e98128429d8..00000000000 --- a/phase10-session-prompt.md +++ /dev/null @@ -1,69 +0,0 @@ -Implement "Phase 10: Peripheral Model Modules — 10-PR Stack Continuation" from AGENTS.md. -Read that section first — it is the complete source of truth (bundles, line ranges, -stack/cascade mechanics, gotchas). Also read the Phase 0–9 playbook above it and follow -CLAUDE.local.md (run-unittest.sh + tee, PR body rules, never commit CLAUDE.md/.claude/). - -## Goal -Finish the reorg by moving the ~45 peripheral model classes still in dojo/models.py into -their domain modules, as a full vertical slice (Phases 1–9) per module. Continue the existing -stack on top of reorg/finding-models (#14974). - -## Order (bottom-up, do not parallelize across branches — they all touch the same monoliths) -1. FIRST: fold CWE + BurpRawRequestResponse into dojo/finding/ on the EXISTING branch - reorg/finding-models (#14974). No new PR. Commit, force-push --force-with-lease to upstream. -2. Then PRs 6–10, each branched from its predecessor's tip: - - reorg/peripheral-user (Bundle A) ← reorg/finding-models - - reorg/peripheral-tools-endpoint (Bundle B) ← peripheral-user - - reorg/peripheral-survey-benchmark(Bundle C) ← peripheral-tools-endpoint - - reorg/peripheral-notes-files (Bundle D) ← peripheral-survey-benchmark - - reorg/peripheral-misc (Bundle E) ← peripheral-notes-files - Each: push to UPSTREAM, open as a DRAFT PR (gh pr create --draft --repo - DefectDojo/django-DefectDojo --base --head ). - -## Subagent strategy (cost + context, NOT parallelism) -All modules edit the same monolith files (dojo/models.py, forms.py, filters.py, -api_v2/serializers.py, api_v2/views.py, urls.py, apps.py). NEVER run two module subagents -concurrently — they would clobber each other's edits to those shared files. Process modules -ONE AT A TIME, bottom-up. - -Use Sonnet subagents to offload mechanical bulk work and preserve your (Opus) context: -- You run Phase 0 pre-flight per module (grep consumers — don't trust memorized lists). -- Spawn ONE Sonnet subagent (model: sonnet) per module, sequentially. It does: move models+admin, - add re-export stubs, string-ref cross-bundle FKs, move forms→ui/forms.py, filters→ui/filters.py, - views/urls→ui/, and the API layer if one exists. Give it exact line ranges from AGENTS.md + the - matching template (dojo/finding/, dojo/product/, dojo/test/, dojo/engagement/, or dojo/url// - dojo/location/ for models-only). -- YOU handle the judgment parts: circular-import resolution, serializer get_fields() lazy - cycle-breaks + extend_schema_field set_override, prefetcher full-reexport, shared-base - decisions (FindingTagStringFilter trap). -- After each module: YOU run verify gates (docker manage.py check, makemigrations --check, - ruff check, ./run-unittest.sh --test-case unittests. 2>&1 | tee /tmp/test_.log, - grep the log). A Sonnet agent does not get to declare success — you confirm via gates before - moving on. - -## Per-bundle gotchas (also in AGENTS.md) -- survey & benchmark: NO api_v2 serializers/viewsets → skip Phases 6–9 for them (confirm w/ Phase 0). -- Question/Answer base classes live inside a `with warnings.catch_warnings():` block — preserve it. -- Benchmark_Requirement → M2M CWE: use string ref "dojo.CWE" (CWE moves to finding lower in stack). -- tool_config: move ToolConfigForm_Admin (ModelForm) + Tool_Configuration_Admin (ModelAdmin) - from models.py into dojo/tool_config/admin.py. -- user (Bundle A): Dojo_User is an FK target everywhere — land it first; string-ref consumers. - -## Commit/PR conventions -- One commit per phase group, same style as existing stack (e.g. - "refactor(): extract API layer into dojo//api/ [ Phase 6,7,8,9]"). -- Every PR body (all 10) must carry the 10-item stack map already on #14970–#14974; copy that - block, set the "◀ this PR" marker. Summary section only — NO test-plan section. Read existing - body first (gh pr view --json body -q '.body'); edit via --body-file or - gh api -X PATCH /repos/DefectDojo/django-DefectDojo/pulls/ --input payload.json; verify after. -- After the 5 new draft PRs exist, BACKFILL their real PR numbers into all 10 PR bodies - (replace the "_draft, to be opened_" placeholders for items 6–10). - -## Stack hygiene -- If you edit a lower branch after upper ones exist, cascade: - git rebase --onto up the chain, then force-push all - with --force-with-lease to upstream. -- Use --no-track when creating branches. - -Work bottom-up, ONE module at a time. Stop and report if any verify gate fails and the fix -isn't mechanical. Narrate each module's pass/fail per CLAUDE.local.md. diff --git a/scripts/run-nuclei-docker.sh b/scripts/run-nuclei-docker.sh deleted file mode 100644 index 6a430e8ef9c..00000000000 --- a/scripts/run-nuclei-docker.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash -# Run ProjectDiscovery Nuclei in Docker with a persistent templates volume and rate limiting. -# Usage: ./scripts/run-nuclei-docker.sh [nuclei-extra-args...] -# Env: RATE_LIMIT, TEMPLATES_DIR, NUCLEI_IMAGE, UPDATE_TEMPLATES - -set -euo pipefail - -if [[ $# -lt 1 ]]; then - echo "Usage: $0 [nuclei-extra-args...]" >&2 - exit 1 -fi - -TARGET="$1" -shift - -RATE_LIMIT="${RATE_LIMIT:-150}" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -TEMPLATES_DIR="${TEMPLATES_DIR:-${SCRIPT_DIR}/.nuclei-templates}" -IMAGE="${NUCLEI_IMAGE:-projectdiscovery/nuclei:latest}" -# Set to "true" to refresh templates before the scan (slower; use periodically). -UPDATE_TEMPLATES="${UPDATE_TEMPLATES:-false}" - -mkdir -p "${TEMPLATES_DIR}" - -DOCKER_RUN=( - docker run --rm - --pull always - -v "${TEMPLATES_DIR}:/root/nuclei-templates" - "${IMAGE}" - -ud /root/nuclei-templates -) - -if [[ "${UPDATE_TEMPLATES}" == "true" ]]; then - echo "Updating Nuclei templates in ${TEMPLATES_DIR} ..." - "${DOCKER_RUN[@]}" -update-templates - echo "" -fi - -echo "Scanning ${TARGET} (rate limit: ${RATE_LIMIT} req/s) ..." -"${DOCKER_RUN[@]}" -u "${TARGET}" -rate-limit "${RATE_LIMIT}" "$@" diff --git a/tagoulus-replcement-options.md b/tagoulus-replcement-options.md deleted file mode 100644 index 795cb77aea1..00000000000 --- a/tagoulus-replcement-options.md +++ /dev/null @@ -1,319 +0,0 @@ -# Tagulous Replacement Options - -## Context - -DefectDojo uses [django-tagulous](https://github.com/radiac/django-tagulous) (forked locally at `/home/valentijn/django-tagulous/`) for tagging on 9 models. Library is unmaintained upstream. Local fork carries 1 commit (Django 5.2 deserializer monkeypatch). Performance is the primary pain point — signal-driven count maintenance and per-row M2M sync on every save. - -## Current State - -### Tagulous usage in DefectDojo - -- **9 TagFields**, all flat, all `force_lowercase=True`. No trees. No `SingleTagField`. No `max_count`. -- Models: `Product`, `Engagement`, `Test`, `Finding`, `Endpoint`, `FindingTemplate`, `AppAnalysis`, `Objects_Product`, `Location`. -- **`tag.count`** unused outside tests. Pure overhead. -- **`_inherited_tag_names` JSONField** already replaced 5 inherited M2Ms in migration `0265`. Pattern proven internally. -- **`dojo/tag_utils.py`** (526 LOC) bypasses tagulous internals for bulk ops. Depends on `tag_options`, `related_model`, `remote_field.through`. -- **9 tag tables + 9 through-tables + 5 JSON columns** active. - -### Tagulous internals (perf hotspots) - -- **Signals fire on every write**: `pre_save`, `post_save`, `pre_delete`, `post_delete`. `post_save` always calls `reload()` + `add()`/`remove()` even when tags unchanged. -- **`update_count()`**: full requery per tag on raw deserialization. -- **Increment/decrement** uses F-expressions per tag, then refreshes. -- **No setting** to disable count maintenance. -- **Library size**: ~4500 LOC total, ~1800 LOC core. Vendorable. - -### Features used vs unused - -| Feature | Used? | -|---|---| -| Flat M2M tags | Yes | -| `force_lowercase` | Yes | -| Custom DRF parsing (comma/quoted) | Yes | -| Autocomplete widget | Yes (admin + UI) | -| Tag trees (`tree=True`) | No | -| `SingleTagField` | No | -| `max_count` | No | -| `protect_initial` / `protect_all` | No | -| `tag.count` (public) | No | -| Merge/rename admin | Minimal | - -## Options Ranked - -### Option A — JSONField only (drop M2M entirely) — best long-term - -Store `tags = JSONField(default=list)` per object. Index via GIN (Postgres) or functional index. Lookups via `tags__contains=[name]`. - -**Pros** -- No through-tables. No tag table. No signals. No count drift. -- Insert/update = single column write. Bulk = `bulk_update`. -- Matches doc-store model. Inheritance is already JSON. -- GIN-backed lookup fast on Postgres. -- Eliminates 9 tag tables + 9 through-tables. Major schema simplification. - -**Cons** -- No central tag registry. Autocomplete needs `DISTINCT jsonb_array_elements` or small `Tag(name)` registry refreshed async. -- Rename tag = `UPDATE WHERE tags @> ['old']` per affected table. Heavier than tagulous merge. -- MySQL/MariaDB JSON perf weaker than Postgres. Check support matrix (DefectDojo supports both). -- Large data migration: 9 fields × all rows. Batched backfill required. -- Serializer/form/widget rewrite. Lose autocomplete widget unless rebuilt. -- Breaks consumers assuming M2M shape (admin filters, pghistory proxies, external plugins). - -### Option B — Vendor tagulous, gut it — best short-term - -Copy `tagulous/` into `dojo/tagulous/`. Strip: -- `count` field + all `increment`/`decrement`/`try_delete`/`update_count` -- tree code (`TagTreeModel`, ~150 LOC) -- `pre_delete`/`post_delete` for `SingleTagField` (unused) -- raw=True `update_count` loop (signal hotspot) -- `protect_initial`/`protect_all` paths (unused) - -Keep: parser, manager, descriptor, autocomplete widget, migration ops. - -**Pros** -- ~50% LOC reduction. Owned. No upstream coupling. -- Removes biggest perf hits (count maintenance, raw deserialization recount). -- Migrations stay valid (same model shape). Zero data migration. -- `dojo/tag_utils.py` keeps working unchanged. -- Unblocks Django 5.2+ permanently. - -**Cons** -- Still M2M-shaped. Per-write signal overhead still exists, just lighter. -- ~2000 LOC owned by team. -- Doesn't fix root cause (signal-driven sync). Trims fat only. - -### Option C — Hybrid: JSON denorm + keep M2M - -Add `tag_names JSONField` alongside existing M2M. Reads/serializers/filters use JSON. M2M kept for admin/legacy. - -**Pros**: incremental. Low risk per step. Can drop M2M later. - -**Cons**: data duplication on writes. Two sources of truth = sync bugs. Worst-of-both during transition. - -### Option D — Keep fork as-is, add knobs - -Add to fork: `count_disabled=True` option, batch signal mode, skip-redundant-add path. - -**Pros**: minimal change. - -**Cons**: still upstream-shaped library no one else maintains. `tag_utils.py` still depends on internals. Doesn't solve root issue. - -### Option E — Build new lib ("if we built it today") - -Tiny library `django-jsontags`: -- `JSONTagField` = JSONField subclass -- Validators (lowercase, charset, max length) -- Manager-like `.add()/.remove()/.set()` via descriptor -- Lookup helpers wrapping `__contains` for cross-DB compat -- Optional async-refreshed `Tag(name, last_seen)` registry for autocomplete -- Postgres GIN auto-index hint -- DRF field + Form widget (Select2 fed by registry) -- ~500 LOC. No signals. No counts. No trees. - -Open-source as replacement for tagulous. Solves real ecosystem pain. - -## Recommendation - -**Two-step plan**: - -### Step 1 (now, 1–2 sprints): Option B - -Vendor tagulous into `dojo/tagulous/`, strip count + tree + raw-recount + unused protect paths. Removes signal hotspots without data migration. Unblocks Django 5.2+. Keeps `tag_utils.py` working. Low risk. - -Deliverables: -- `dojo/tagulous/` with stripped code -- Pin replaces `tagulous` import path -- Drop `count` column via single migration -- Benchmark: import workflow + bulk-tag ops before/after - -### Step 2 (next quarter): Option A - -Migrate to JSONField, model by model, starting with **Endpoint** (highest write volume from imports) then **Finding** (largest table). Per model: - -1. Add `tag_names JSONField` + GIN index (Postgres) / functional index (MySQL) -2. Backfill in batches via management command -3. Switch reads to JSON (services, serializers, filters, search) -4. Switch writes to JSON via service layer (`tag_utils.py` becomes thin wrapper) -5. Validate parity for one release -6. Drop M2M + tag table - -### Step 3 (optional, after dogfooding): Option E - -Extract `django-jsontags` library from internal code. Free byproduct of step 2. Open-source as tagulous successor. - -## Key Risks - -- **DB matrix**: confirm Postgres + MySQL JSON ops. DefectDojo supports both. Test on MariaDB minimum supported version. -- **Autocomplete UX**: registry table refresh strategy must be designed before any model flips. -- **Audit log (pghistory)**: tags currently tracked via through-table proxy models. JSON column tracks differently. Verify history continuity. -- **External integrations**: any plugin/parser/API consumer assuming M2M shape breaks. Grep `dojo_pro` + external projects. -- **Search ranking**: full-text/Watson over tag names — confirm GIN-backed `@>` matches current ranking behavior. -- **Admin filters**: Django admin `list_filter` on tags loses M2M widget. Replace or drop. -- **Bulk-import perf regression**: validate `tag_utils.py` paths still beat naive M2M during step 1, before step 2 begins. - -## Open Questions - -- Drop MariaDB/MySQL support tier for tag-heavy queries, or build cross-DB lookup helper? -- Move tag registry refresh to Celery task or DB trigger? -- Keep merge/rename admin UI or accept SQL-level rename procedure? -- Include `Tag.description`/`Tag.color` metadata fields in registry, or keep tags pure strings? - ---- - -## Addendum — Design notes from follow-up review - -### `raw=True` on `post_save` - -Django passes `raw=True` when the save originates from `loaddata` (fixture/dump deserialization). Tagulous treats this as "data injected directly into DB, internal counters stale" and calls `tag.update_count()` per tag — full `COUNT(*)` requery for every tag on every fixture-loaded row. Catastrophic on large dumps. Stripping this branch is one of the cheap wins in Option B. - -### Field-change detection (Option A signal) - -Codebase already uses `from fieldsignals import pre_save_changed` (see [dojo/finding/helper.py:16](dojo/finding/helper.py#L16)). No need for `django-model-utils` `FieldTracker` or self-rolled mixin. - -Hook shape: - -```python -from fieldsignals import pre_save_changed - -@receiver(pre_save_changed, sender=Finding, fields=['tags']) -def on_tags_changed(sender, instance, changed_fields, **kwargs): - field = Finding._meta.get_field('tags') - old, new = changed_fields[field] - added = set(new or []) - set(old or []) - removed = set(old or []) - set(new or []) - ... -``` - -### Batch context manager - -Mirror existing `dojo/tag_inheritance.py:batch()` pattern. Thread-local flag suppresses dispatch, accumulates diff, flushes on exit: - -```python -_batch_state = threading.local() - -@contextmanager -def batch_tag_writes(): - _batch_state.active = True - _batch_state.added = set() - _batch_state.removed = set() - try: - yield - finally: - added, removed = _batch_state.added, _batch_state.removed - _batch_state.active = False - if added or removed: - transaction.on_commit(lambda: registry_upsert.delay(list(added), list(removed))) -``` - -Importer wraps bulk loop in `with batch_tag_writes():`. Single dispatch instead of N. - -### Edge cases for any signal-driven path - -- **`bulk_update`/`bulk_create` skip signals.** Bulk paths must call registry update explicitly. `dojo/tag_utils.py` is the right seam — already centralizes bulk. -- **Queryset `.update()` skips signals too.** Same fix. -- **Transaction rollback**: always use `transaction.on_commit(...)` so dispatched task only fires on committed write. Critical to avoid registry pollution from rolled-back imports. -- **Race vs nightly GC**: GC must check current state at run-time, not trust `last_seen` alone. Otherwise concurrent writes lose entries. - ---- - -## Tag Registry — three implementation flavors - -The "registry" = table backing autocomplete and (optionally) per-tag counts. Three valid shapes, ranked. - -### Reg-1 — Maintained table + signals - -`Tag(name PK, last_seen)` updated by `pre_save_changed` signal. Nightly GC removes orphans. - -- **Pros**: cheap read (`name ILIKE 'crit%' LIMIT 20`), realtime visibility of new tags. -- **Cons**: drift under bulk paths, signal plumbing, batch hooks, `on_commit`, GC job. Same maintenance disease as `tag.count` today, in miniature. - -### Reg-2 — On-demand DISTINCT (no registry) - -```sql -SELECT DISTINCT jsonb_array_elements_text(tags) AS name -FROM finding -ORDER BY name LIMIT 20; -``` - -UNION across tagged tables for global autocomplete. - -- **Pros**: zero maintenance. Always current. No signals. -- **Cons**: GIN index does **not** accelerate `DISTINCT jsonb_array_elements_text` — it accelerates containment, not distinct extraction. Needs expression index or `pg_trgm` on extracted values. Slow on Finding/Endpoint without help. -- **Verdict**: viable for small tables only (Product, Engagement). Not for hot tables. - -### Reg-3 — Materialized view ★ recommended - -```sql -CREATE MATERIALIZED VIEW dojo_tag_names AS -SELECT DISTINCT name FROM ( - SELECT jsonb_array_elements_text(tags) AS name FROM finding - UNION - SELECT jsonb_array_elements_text(tags) FROM product - UNION ... -) t; -CREATE UNIQUE INDEX ON dojo_tag_names(name); -CREATE INDEX ON dojo_tag_names USING gin(name gin_trgm_ops); -``` - -Refresh `CONCURRENTLY` on cron (every 5 min). Optional write-path fast-path: `INSERT ... ON CONFLICT DO NOTHING` on tag-create so new tags autocomplete immediately. View still rebuilds nightly to GC removed names. - -- **Pros**: zero signal coupling. Fast autocomplete (trigram). Converges. Single bg job. No per-write maintenance. -- **Cons**: stale window for *removed* tags (acceptable — autocomplete tolerates ghosts). Postgres-only. -- **MariaDB equivalent**: regular table + scheduled `INSERT ... ON DUPLICATE KEY UPDATE` from `SELECT DISTINCT JSON_EXTRACT(...)`. Same shape, manual refresh. - -### Recommendation - -**Reg-3** with optional insert-only fast-path on tag create. Reg-1 only wins if realtime *removal* visible in autocomplete is required (it isn't). Reg-2 only wins on tiny tables. - ---- - -## Counts later (if/when needed) — three flavors - -Same lesson as registry: don't bring M2M back. Counts fit in registry table or a derived view. - -### Count-1 — Incremental column on registry - -`ref_count INTEGER` updated by signal diff. Atomic `F('ref_count') ± 1`. - -- **Pros**: O(diff) per write. Realtime. -- **Cons**: drifts under bulk + rollback. Same maintenance disease as today's `tag.count`. - -### Count-2 — On-demand recount - -```sql -SELECT COUNT(*) FROM finding WHERE tags @> '["critical"]'::jsonb; -``` - -GIN index makes containment fast even on 1M rows. - -- **Pros**: always correct. No coupling. -- **Cons**: O(matching rows) per query unless cached. - -### Count-3 — Materialized view ★ recommended if counts ever needed - -```sql -CREATE MATERIALIZED VIEW dojo_tag_counts AS -SELECT 'finding' AS scope, tag, COUNT(*) AS n -FROM finding, jsonb_array_elements_text(tags) AS tag -GROUP BY tag -UNION ALL ...; -``` - -Refresh nightly or on demand. - -- **Pros**: correct, fast read, no coupling, multi-scope in one query. -- **Cons**: stale until refresh. Postgres CONCURRENTLY needs unique index. MariaDB needs equivalent table + cron. - -### Recommendation - -Start with **no count column at all**. `tag.count` was unused for years. If a real consumer appears (tag cloud, reports), add **Count-3**. Skip Count-1 — its maintenance cost is exactly what Option A is escaping. - ---- - -## Updated decision summary - -- **Field-change detection**: `fieldsignals.pre_save_changed`. Already in tree. -- **Registry**: Reg-3 (materialized view + optional insert-only fast-path). -- **Counts**: skip until proven need. Then Count-3. -- **Batch path**: thread-local context manager + `transaction.on_commit` + explicit hooks in `tag_utils.py` bulk methods. -- **M2M not needed for any of the above.** From 1cd7ad35fb6520178216dedab3b3663ecdbef5e9 Mon Sep 17 00:00:00 2001 From: Cody Maffucci <46459665+Maffooch@users.noreply.github.com> Date: Tue, 30 Jun 2026 13:09:09 -0600 Subject: [PATCH 26/29] fix(migrations): renumber dedup-execution-mode migrations onto current dev leaf dev advanced past the previous 0270/0271 numbering (it now ships 0270_finding_visibility_perf_indexes through 0272_reencrypt_tool_config_credentials_aes_gcm), so the PR's 0270/0271 migrations reintroduced a second leaf node off 0269_normalize_blank_finding_components -> 'Conflicting migrations detected; multiple leaf nodes in the migration graph'. Renumber onto the current leaf: - 0270_usercontactinfo_deduplication_execution_mode -> 0273_..., dep 0272_reencrypt_tool_config_credentials_aes_gcm - 0271_seed_deduplication_execution_mode -> 0274_..., dep 0273_usercontactinfo_deduplication_execution_mode Co-Authored-By: Claude Opus 4.8 (1M context) --- ....py => 0273_usercontactinfo_deduplication_execution_mode.py} | 2 +- ...cution_mode.py => 0274_seed_deduplication_execution_mode.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename dojo/db_migrations/{0270_usercontactinfo_deduplication_execution_mode.py => 0273_usercontactinfo_deduplication_execution_mode.py} (93%) rename dojo/db_migrations/{0271_seed_deduplication_execution_mode.py => 0274_seed_deduplication_execution_mode.py} (94%) diff --git a/dojo/db_migrations/0270_usercontactinfo_deduplication_execution_mode.py b/dojo/db_migrations/0273_usercontactinfo_deduplication_execution_mode.py similarity index 93% rename from dojo/db_migrations/0270_usercontactinfo_deduplication_execution_mode.py rename to dojo/db_migrations/0273_usercontactinfo_deduplication_execution_mode.py index 412bfe2d546..cb57c34b2fe 100644 --- a/dojo/db_migrations/0270_usercontactinfo_deduplication_execution_mode.py +++ b/dojo/db_migrations/0273_usercontactinfo_deduplication_execution_mode.py @@ -4,7 +4,7 @@ class Migration(migrations.Migration): dependencies = [ - ('dojo', '0269_normalize_blank_finding_components'), + ('dojo', '0272_reencrypt_tool_config_credentials_aes_gcm'), ] operations = [ diff --git a/dojo/db_migrations/0271_seed_deduplication_execution_mode.py b/dojo/db_migrations/0274_seed_deduplication_execution_mode.py similarity index 94% rename from dojo/db_migrations/0271_seed_deduplication_execution_mode.py rename to dojo/db_migrations/0274_seed_deduplication_execution_mode.py index 74154304acf..cf9a239c707 100644 --- a/dojo/db_migrations/0271_seed_deduplication_execution_mode.py +++ b/dojo/db_migrations/0274_seed_deduplication_execution_mode.py @@ -22,7 +22,7 @@ def unseed_deduplication_execution_mode(apps, schema_editor): class Migration(migrations.Migration): dependencies = [ - ('dojo', '0270_usercontactinfo_deduplication_execution_mode'), + ('dojo', '0273_usercontactinfo_deduplication_execution_mode'), ] operations = [ From 1d9fcdec2d18e9fc92deb200661f3980584a92d3 Mon Sep 17 00:00:00 2001 From: Cody Maffucci <46459665+Maffooch@users.noreply.github.com> Date: Tue, 30 Jun 2026 14:03:10 -0600 Subject: [PATCH 27/29] test(perf): bump async_wait dedup query baseline to match measured counts CI 'Check counts are up to date' measured 94/74 for test_deduplication_performance_pghistory_async_wait (first/second import) vs the committed 93/73. The web-side delta over plain async (92/72) is +2, not +1: the post-dedup notification refresh SELECT plus one result-backend write from the per-dispatch ignore_result=False that enables the AsyncResult.get() join. Update the baseline and the docstring accordingly. Co-Authored-By: Claude Opus 4.8 (1M context) --- unittests/test_importers_performance.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 8812f7287d4..7a226f089a8 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -559,8 +559,9 @@ def test_deduplication_performance_pghistory_async_wait(self): Deduplication performance in the 'async_wait' execution mode: post-processing is dispatched to a background worker, then the request joins on the result before responding. The dedup queries run in the worker (a separate connection), NOT in - the web request, so the only web-side cost over the plain async path is the single - post-dedup notification refresh SELECT (+1). + the web request, so the only web-side cost over the plain async path is +2: the + post-dedup notification refresh SELECT, plus one result-backend write from the + per-dispatch ignore_result=False that lets the request join via AsyncResult.get(). We do not use CELERY_TASK_ALWAYS_EAGER here — that would run the dispatched task inline on the request's connection and wrongly count the worker's dedup queries. @@ -579,9 +580,9 @@ def test_deduplication_performance_pghistory_async_wait(self): # returns instantly without executing dedup on the request's DB connection. with patch("celery.result.AsyncResult.get", return_value=None): self._deduplication_performance( - expected_num_queries1=93, + expected_num_queries1=94, expected_num_async_tasks1=2, - expected_num_queries2=73, + expected_num_queries2=74, expected_num_async_tasks2=2, dedup_mode="async_wait", check_duplicates=False, From 0533fe7e54a3dab787a642c36e3b148e4f41be6c Mon Sep 17 00:00:00 2001 From: Cody Maffucci <46459665+Maffooch@users.noreply.github.com> Date: Thu, 2 Jul 2026 11:10:57 -0600 Subject: [PATCH 28/29] fix(migrations): renumber dedup-execution-mode migrations onto current dev leaf Rebasing onto dev pulled in 0273_product_upper_name_index, 0274_finding_sla_expiration_index and 0275_usercontactinfo_user_state_details, colliding with this branch's 0273/0274 and producing two leaf nodes. Renumber onto the current leaf: - 0273_usercontactinfo_deduplication_execution_mode -> 0276_..., dep 0275_usercontactinfo_user_state_details - 0274_seed_deduplication_execution_mode -> 0277_..., dep 0276_usercontactinfo_deduplication_execution_mode Co-Authored-By: Claude Opus 4.8 (1M context) --- ....py => 0276_usercontactinfo_deduplication_execution_mode.py} | 2 +- ...cution_mode.py => 0277_seed_deduplication_execution_mode.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename dojo/db_migrations/{0273_usercontactinfo_deduplication_execution_mode.py => 0276_usercontactinfo_deduplication_execution_mode.py} (93%) rename dojo/db_migrations/{0274_seed_deduplication_execution_mode.py => 0277_seed_deduplication_execution_mode.py} (94%) diff --git a/dojo/db_migrations/0273_usercontactinfo_deduplication_execution_mode.py b/dojo/db_migrations/0276_usercontactinfo_deduplication_execution_mode.py similarity index 93% rename from dojo/db_migrations/0273_usercontactinfo_deduplication_execution_mode.py rename to dojo/db_migrations/0276_usercontactinfo_deduplication_execution_mode.py index cb57c34b2fe..3a260b9dd48 100644 --- a/dojo/db_migrations/0273_usercontactinfo_deduplication_execution_mode.py +++ b/dojo/db_migrations/0276_usercontactinfo_deduplication_execution_mode.py @@ -4,7 +4,7 @@ class Migration(migrations.Migration): dependencies = [ - ('dojo', '0272_reencrypt_tool_config_credentials_aes_gcm'), + ('dojo', '0275_usercontactinfo_user_state_details'), ] operations = [ diff --git a/dojo/db_migrations/0274_seed_deduplication_execution_mode.py b/dojo/db_migrations/0277_seed_deduplication_execution_mode.py similarity index 94% rename from dojo/db_migrations/0274_seed_deduplication_execution_mode.py rename to dojo/db_migrations/0277_seed_deduplication_execution_mode.py index cf9a239c707..6577d86946c 100644 --- a/dojo/db_migrations/0274_seed_deduplication_execution_mode.py +++ b/dojo/db_migrations/0277_seed_deduplication_execution_mode.py @@ -22,7 +22,7 @@ def unseed_deduplication_execution_mode(apps, schema_editor): class Migration(migrations.Migration): dependencies = [ - ('dojo', '0273_usercontactinfo_deduplication_execution_mode'), + ('dojo', '0276_usercontactinfo_deduplication_execution_mode'), ] operations = [ From 5acd5f5b86a0b523fbf0d27095c3d4fda21ed656 Mon Sep 17 00:00:00 2001 From: Cody Maffucci <46459665+Maffooch@users.noreply.github.com> Date: Thu, 2 Jul 2026 11:19:34 -0600 Subject: [PATCH 29/29] docs: move deduplication_execution_mode upgrade note to 3.1 The feature now ships in 3.1.0, not 2.60. Revert 2.60.md to its "no special instructions" state and fold the deduplication execution mode section into 3.1.md. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/content/en/open_source/upgrading/2.60.md | 43 +------------------ docs/content/en/open_source/upgrading/3.1.md | 42 +++++++++++++++++- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/docs/content/en/open_source/upgrading/2.60.md b/docs/content/en/open_source/upgrading/2.60.md index 529dd8ce54c..e7811aa0b99 100644 --- a/docs/content/en/open_source/upgrading/2.60.md +++ b/docs/content/en/open_source/upgrading/2.60.md @@ -2,45 +2,6 @@ title: 'Upgrading to DefectDojo Version 2.60.x' toc_hide: true weight: -20260601 -description: New deduplication execution mode for import/reimport. +description: No special instructions. --- - -## Deduplication execution mode for import/reimport - -This release adds a new `deduplication_execution_mode` setting that controls how -import/reimport deduplication post-processing is dispatched and whether the API -response waits for it. It can be set per user (profile) and overridden per request -on the import and reimport endpoints. - -Modes: - -- `async` (default): deduplication and the rest of post-processing are dispatched - to the background and the response returns immediately. This is the historical - behavior; nothing changes for existing users. -- `async_wait`: post-processing is still dispatched to the background, but the - request waits for deduplication to finish before responding. As a result the - `scan_added` notification and the statistics in the import/reimport response - reflect the deduplicated state (findings that turned out to be duplicates are - no longer counted/listed as new). JIRA push, product grading and other - non-deduplication tasks remain asynchronous and are not awaited. -- `sync`: import deduplication runs inline in the web request. - -The wait in `async_wait` is bounded by the new `DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT` -environment variable (default `60` seconds). If no worker picks up the work within -the timeout, the request responds anyway (degrading to the `async` outcome) rather -than hanging. - -The import/reimport response now also includes a `deduplication_complete` boolean -indicating whether deduplication had finished by the time the response was produced. - -### Relationship to `block_execution` - -The existing `block_execution` profile flag is unchanged. It remains the global -switch that forces **all** of a user's asynchronous tasks (notifications, JIRA -push, product grading, deduplication, ...) to run in the foreground. -`deduplication_execution_mode` is independent and narrower — it only affects -import/reimport deduplication post-processing. A user who has `block_execution` -enabled continues to get fully synchronous imports; the upgrade migration seeds -their `deduplication_execution_mode` to `sync` so behavior is unchanged. - -No action is required to upgrade. Check the [Release Notes](https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.60.0) for the contents of the release. +There are no special instructions for upgrading to 2.60.x. Check the [Release Notes](https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.60.0) for the contents of the release. diff --git a/docs/content/en/open_source/upgrading/3.1.md b/docs/content/en/open_source/upgrading/3.1.md index 540140c7aba..0adbbef6a27 100644 --- a/docs/content/en/open_source/upgrading/3.1.md +++ b/docs/content/en/open_source/upgrading/3.1.md @@ -2,10 +2,50 @@ title: 'Upgrading to DefectDojo Version 3.1.x' toc_hide: true weight: -20260615 -description: New optional setting DD_OS_MESSAGE_ENABLED to control the open-source promo banner. +description: New optional setting DD_OS_MESSAGE_ENABLED to control the open-source promo banner, and a new deduplication execution mode for import/reimport. --- There are no breaking changes when upgrading to 3.1.x. Check the [Release Notes](https://github.com/DefectDojo/django-DefectDojo/releases/tag/3.1.0) for the contents of the release. ### New setting: `DD_OS_MESSAGE_ENABLED` This release adds the `DD_OS_MESSAGE_ENABLED` setting (default `True`), which controls the open-source promotional ("Upgrade to Pro") banner. The default preserves the existing behavior. Set `DD_OS_MESSAGE_ENABLED=False` to hide the banner; when disabled, DefectDojo skips the outbound request that fetches the message. + +## Deduplication execution mode for import/reimport + +This release adds a new `deduplication_execution_mode` setting that controls how +import/reimport deduplication post-processing is dispatched and whether the API +response waits for it. It can be set per user (profile) and overridden per request +on the import and reimport endpoints. + +Modes: + +- `async` (default): deduplication and the rest of post-processing are dispatched + to the background and the response returns immediately. This is the historical + behavior; nothing changes for existing users. +- `async_wait`: post-processing is still dispatched to the background, but the + request waits for deduplication to finish before responding. As a result the + `scan_added` notification and the statistics in the import/reimport response + reflect the deduplicated state (findings that turned out to be duplicates are + no longer counted/listed as new). JIRA push, product grading and other + non-deduplication tasks remain asynchronous and are not awaited. +- `sync`: import deduplication runs inline in the web request. + +The wait in `async_wait` is bounded by the new `DD_DEDUPLICATION_ASYNC_WAIT_TIMEOUT` +environment variable (default `60` seconds). If no worker picks up the work within +the timeout, the request responds anyway (degrading to the `async` outcome) rather +than hanging. + +The import/reimport response now also includes a `deduplication_complete` boolean +indicating whether deduplication had finished by the time the response was produced. + +### Relationship to `block_execution` + +The existing `block_execution` profile flag is unchanged. It remains the global +switch that forces **all** of a user's asynchronous tasks (notifications, JIRA +push, product grading, deduplication, ...) to run in the foreground. +`deduplication_execution_mode` is independent and narrower — it only affects +import/reimport deduplication post-processing. A user who has `block_execution` +enabled continues to get fully synchronous imports; the upgrade migration seeds +their `deduplication_execution_mode` to `sync` so behavior is unchanged. + +No action is required to upgrade.