Skip to content

Commit a82e9b1

Browse files
authored
ci: run benchmark configs in parallel (#15364)
This change parallelizes the microbenchmark jobs to make more efficient use of CI time and compute resources
1 parent 09f2961 commit a82e9b1

File tree

5 files changed

+141
-49
lines changed

5 files changed

+141
-49
lines changed

.gitlab/benchmarks/bp-runner.microbenchmarks.fail-on-breach.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -726,19 +726,19 @@ experiments:
726726
- name: iastpropagation-no-propagation
727727
thresholds:
728728
- execution_time < 0.06 ms
729-
- max_rss_usage < 40.50 MB
729+
- max_rss_usage < 42.00 MB
730730
- name: iastpropagation-propagation_enabled
731731
thresholds:
732732
- execution_time < 0.19 ms
733-
- max_rss_usage < 40.00 MB
733+
- max_rss_usage < 42.00 MB
734734
- name: iastpropagation-propagation_enabled_100
735735
thresholds:
736736
- execution_time < 2.30 ms
737-
- max_rss_usage < 40.00 MB
737+
- max_rss_usage < 42.00 MB
738738
- name: iastpropagation-propagation_enabled_1000
739739
thresholds:
740740
- execution_time < 34.55 ms
741-
- max_rss_usage < 40.00 MB
741+
- max_rss_usage < 42.00 MB
742742

743743
# otelsdkspan
744744
- name: otelsdkspan-add-event

.gitlab/benchmarks/microbenchmarks.yml

Lines changed: 30 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ variables:
1010
PACKAGE_IMAGE: registry.ddbuild.io/images/mirror/pypa/manylinux2014_x86_64:2025-04-12-5990e2d
1111
GITHUB_CLI_IMAGE: registry.ddbuild.io/images/dd-octo-sts-ci-base:2025.06-1
1212
BENCHMARKING_BRANCH: dd-trace-py
13-
BENCHMARKING_COMMIT_SHA: e7bbac96e1ae9bfb5f8906dcdf103b08f5ca0805
13+
BENCHMARKING_COMMIT_SHA: 32681a9f805f4d62cf6bd7d205ddeb83ab72288d
1414

1515
.benchmarks:
1616
stage: test
@@ -24,8 +24,6 @@ variables:
2424
timeout: 30m
2525
dependencies: [ "baseline:build", "candidate" ]
2626
script: |
27-
export REPORTS_DIR="$(pwd)/reports/" && (mkdir "${REPORTS_DIR}" || :)
28-
2927
if [[ -n "$CI_JOB_TOKEN" ]];
3028
then
3129
git config --global url."https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/".insteadOf "https://github.com/DataDog/"
@@ -34,18 +32,28 @@ variables:
3432
(cd /platform && git reset --hard "${BENCHMARKING_COMMIT_SHA}")
3533
export PATH="$PATH:/platform/steps"
3634
37-
capture-hardware-software-info.sh
35+
for SCENARIO in $(echo "$SCENARIOS" | tr -s '[:space:]' ' ');
36+
do
37+
export REPORTS_DIR="$(pwd)/reports/${SCENARIO}/" && (mkdir -p "${REPORTS_DIR}" || :)
3838
39-
if [[ $SCENARIO =~ ^flask_* || $SCENARIO =~ ^django_* ]];
40-
then
41-
BP_SCENARIO=$SCENARIO bp-runner "${CI_PROJECT_DIR:-.}/.gitlab/benchmarks/bp-runner.yml" --debug -t
42-
else
43-
run-benchmarks.sh
44-
fi
39+
capture-hardware-software-info.sh
40+
41+
if [[ $SCENARIO =~ ^flask_* || $SCENARIO =~ ^django_* ]];
42+
then
43+
BP_SCENARIO=$SCENARIO bp-runner "${CI_PROJECT_DIR:-.}/.gitlab/benchmarks/bp-runner.yml" --debug -t
44+
else
45+
run-benchmarks.sh
46+
fi
4547
46-
analyze-results.sh
48+
# Join all config results into a single results.json
49+
.gitlab/benchmarks/steps/combine-results.sh "/artifacts/${CI_JOB_ID}-${SCENARIO}/candidate/"
50+
.gitlab/benchmarks/steps/combine-results.sh "/artifacts/${CI_JOB_ID}-${SCENARIO}/baseline/"
4751
48-
upload-results-to-s3.sh || :
52+
analyze-results.sh
53+
upload-results-to-s3.sh || :
54+
# Copy converted JSON reports to common location
55+
cp $REPORTS_DIR/*.converted.json $(pwd)/reports/
56+
done
4957
5058
# We have to move artifacts to ${CI_PROJECT_DIR} if we want to attach as GitLab artifact
5159
cp -R /artifacts ${CI_PROJECT_DIR}/
@@ -146,40 +154,24 @@ candidate:
146154
microbenchmarks:
147155
extends: .benchmarks
148156
parallel:
157+
# DEV: The organization into these groups is mostly arbitrary, based on observed runtimes and
158+
# trying to keep total runtime per job <10 minutes
149159
matrix:
150-
- SCENARIO:
151-
- "span"
152-
- "tracer"
153-
- "sampling_rule_matches"
154-
- "set_http_meta"
155-
- "django_simple"
156-
- "flask_simple"
157-
- "flask_sqli"
158-
- "core_api"
159-
- "otel_span"
160-
- "otel_sdk_span"
161-
- "appsec_iast_aspects"
162-
- "appsec_iast_aspects_ospath"
163-
- "appsec_iast_aspects_re_module"
164-
- "appsec_iast_aspects_split"
160+
- CPUS_PER_RUN: "1"
161+
SCENARIOS:
162+
- "span tracer core_api set_http_meta telemetry_add_metric otel_span otel_sdk_span recursive_computation sampling_rule_matches"
163+
- "http_propagation_extract http_propagation_inject rate_limiter appsec_iast_aspects appsec_iast_aspects_ospath appsec_iast_aspects_re_module appsec_iast_aspects_split appsec_iast_propagation"
164+
- "packages_package_for_root_module_mapping packages_update_imported_dependencies"
165+
- CPUS_PER_RUN: "2"
166+
SCENARIOS:
167+
- "django_simple flask_simple flask_sqli errortracking_django_simple errortracking_flask_sqli"
165168
# Flaky timeouts on starting up
166169
# - "appsec_iast_django_startup"
167-
# TODO: Re-enable when this issue is resolved:
168-
- "appsec_iast_propagation"
169-
- "errortracking_django_simple"
170170
# They take a long time to run and frequently time out
171171
# TODO: Make benchmarks faster, or run less frequently, or as macrobenchmarks
172172
# - "appsec_iast_django_startup"
173-
- "errortracking_flask_sqli"
174173
# Flaky. Timeout errors
175174
# - "encoder"
176-
- "http_propagation_extract"
177-
- "http_propagation_inject"
178-
- "rate_limiter"
179-
- "packages_package_for_root_module_mapping"
180-
- "packages_update_imported_dependencies"
181-
- "recursive_computation"
182-
- "telemetry_add_metric"
183175
# They take a long time to run, and now need the agent running
184176
# TODO: Make benchmarks faster, or run less frequently, or as macrobenchmarks
185177
# - "startup"
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env bash
set -exo pipefail

# Usage: combine-results.sh <artifacts-dir>
# Merges the per-config pyperf result files (results.<config>.json) written by
# a parallel benchmark run into a single results.json in the same directory.
ARTIFACTS_DIR="${1}"

# Combine all the individual results into a single results file.
# We need:
# - to merge all the benchmarks into a single list
# - to keep only one copy of the metadata, removing fields that are per-benchmark specific
# - add benchmark specific metadata into each benchmark entry
jq -s '
  map(
    . as $file
    | .benchmarks |= map(
        .metadata = ($file.metadata | { name, loops, cpu_affinity, cpu_config, cpu_freq } )
      )
    | {
        benchmarks: .benchmarks,
        leftover_meta: (.metadata | del(.name, .loops, .cpu_affinity, .cpu_config, .cpu_freq))
      }
  )
  |
  {
    benchmarks: (map(.benchmarks) | add),
    metadata: (first | .leftover_meta)
  }
' "${ARTIFACTS_DIR}"/results.*.json > "${ARTIFACTS_DIR}/results.json"

benchmarks/base/run.py

Lines changed: 76 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,12 @@
22

33
import json
44
import os
5+
import queue
56
import subprocess
67
import sys
8+
import threading
9+
from typing import Any
10+
from typing import Optional
711

812
import yaml
913

@@ -16,13 +20,39 @@ def read_config(path):
1620
return yaml.load(fp, Loader=yaml.FullLoader)
1721

1822

19-
def run(scenario_py, cname, cvars, output_dir):
23+
def cpu_affinity_to_cpu_groups(cpu_affinity: str, cpus_per_run: int) -> list[list[int]]:
    """Expand a CPU affinity spec into groups of ``cpus_per_run`` CPU IDs.

    ``cpu_affinity`` is a comma-separated list of CPU IDs and inclusive
    ranges, e.g.::

        6-11
        6-11,14,15
        6-11,13-15,16,18,20-21

    Returns:
        A list of CPU-ID groups, each of length ``cpus_per_run``, preserving
        the order the CPUs appear in the spec.

    Raises:
        ValueError: if the total CPU count is not evenly divisible by
            ``cpus_per_run`` (a partial group cannot be scheduled), or if an
            entry in the spec is not a valid integer or range.
    """
    cpu_ids: list[int] = []
    for part in cpu_affinity.split(","):
        # Tolerate surrounding whitespace and stray/trailing commas, which
        # would otherwise crash on int("").
        part = part.strip()
        if not part:
            continue
        if "-" in part:
            start, end = part.split("-")
            cpu_ids.extend(range(int(start), int(end) + 1))
        else:
            cpu_ids.append(int(part))

    if len(cpu_ids) % cpus_per_run != 0:
        raise ValueError(f"CPU count {len(cpu_ids)} not divisible by CPUS_PER_RUN={cpus_per_run}")
    return [cpu_ids[i : i + cpus_per_run] for i in range(0, len(cpu_ids), cpus_per_run)]
40+
41+
42+
def run(scenario_py: str, cname: str, cvars: dict[str, Any], output_dir: str, cpus: Optional[list[int]] = None):
43+
cmd: list[str] = []
44+
45+
if cpus:
46+
# Use taskset to set CPU affinity
47+
cpu_list_str = ",".join(str(cpu) for cpu in cpus)
48+
cmd += ["taskset", "-c", cpu_list_str]
49+
2050
if SHOULD_PROFILE:
2151
# viztracer won't create the missing directory itself
2252
viztracer_output_dir = os.path.join(output_dir, "viztracer")
2353
os.makedirs(viztracer_output_dir, exist_ok=True)
2454

25-
cmd = [
55+
cmd += [
2656
"viztracer",
2757
"--minimize_memory",
2858
"--min_duration",
@@ -33,14 +63,14 @@ def run(scenario_py, cname, cvars, output_dir):
3363
os.path.join(output_dir, "viztracer", "{}.json".format(cname)),
3464
]
3565
else:
36-
cmd = ["python"]
66+
cmd += ["python"]
3767

3868
cmd += [
3969
scenario_py,
4070
# necessary to copy PYTHONPATH for venvs
4171
"--copy-env",
42-
"--append",
43-
os.path.join(output_dir, "results.json"),
72+
"--output",
73+
os.path.join(output_dir, f"results.{cname}.json"),
4474
"--name",
4575
cname,
4676
]
@@ -72,5 +102,45 @@ def run(scenario_py, cname, cvars, output_dir):
72102
config = {k: v for k, v in config.items() if k in allowed_configs}
73103
print("Filtering to configs: {}".format(", ".join(sorted(config.keys()))))
74104

105+
CPU_AFFINITY = os.environ.get("CPU_AFFINITY")
106+
107+
# No CPU affinity specified, run sequentially
108+
if not CPU_AFFINITY:
109+
for cname, cvars in config.items():
110+
run("scenario.py", cname, cvars, output_dir)
111+
sys.exit(0)
112+
113+
CPUS_PER_RUN = int(os.environ.get("CPUS_PER_RUN", "1"))
114+
cpu_groups = cpu_affinity_to_cpu_groups(CPU_AFFINITY, CPUS_PER_RUN)
115+
116+
print(f"Running with CPU affinity: {CPU_AFFINITY}")
117+
print(f"CPUs per run: {CPUS_PER_RUN}")
118+
print(f"CPU groups: {list(cpu_groups)}")
119+
120+
job_queue = queue.Queue()
121+
cpu_queue = queue.Queue()
122+
123+
def worker(cpu_queue: queue.Queue, job_queue: queue.Queue):
    """Drain jobs from ``job_queue``, running each pinned to a free CPU group.

    CPU groups circulate through ``cpu_queue``: a worker claims one for the
    duration of a run and returns it afterwards so other workers can reuse it.
    """
    while True:
        try:
            # get_nowait avoids the qsize()/get() race: qsize() may report a
            # job that another worker claims before our get() runs, which
            # would raise queue.Empty after the timeout and kill this thread.
            cname, cvars = job_queue.get_nowait()
        except queue.Empty:
            return

        cpus = cpu_queue.get()
        try:
            print(f"Starting run {cname} on CPUs {cpus}")
            run("scenario.py", cname, cvars, output_dir, cpus=cpus)
            print(f"Finished run {cname}")
        finally:
            # Always return the CPU group, even if the run raised, so the
            # remaining workers are not starved of CPUs.
            cpu_queue.put(cpus)
75133
for cname, cvars in config.items():
76-
run("scenario.py", cname, cvars, output_dir)
134+
job_queue.put((cname, cvars))
135+
136+
workers = []
137+
print(f"Starting {len(cpu_groups)} worker threads")
138+
for cpus in cpu_groups:
139+
cpu_queue.put(cpus)
140+
t = threading.Thread(target=worker, args=(cpu_queue, job_queue))
141+
t.start()
142+
workers.append(t)
143+
144+
for t in workers:
145+
t.join()
146+
print("All runs completed.")

ddtrace/contrib/internal/trace_utils_base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,10 @@ def _set_url_tag(integration_config: IntegrationConfig, span: Span, url: str, qu
150150
# users should set ``DD_TRACE_HTTP_CLIENT_TAG_QUERY_STRING=False``. This case should be
151151
# removed when config.global_query_string_obfuscation_disabled is removed (v3.0).
152152
span._set_tag_str(http.URL, url)
153-
elif getattr(config._obfuscation_query_string_pattern, "pattern", None) == b"":
153+
elif (
154+
config._obfuscation_query_string_pattern is None
155+
or getattr(config._obfuscation_query_string_pattern, "pattern", None) == b""
156+
):
154157
# obfuscation is disabled when DD_TRACE_OBFUSCATION_QUERY_STRING_REGEXP=""
155158
span._set_tag_str(http.URL, strip_query_string(url))
156159
else:

0 commit comments

Comments
 (0)