Commit 13f8b8f

Merge branch 'main' into user/qa/post_update_waive_20251201_LLM_FUNCTION_TEST_1712
Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
2 parents aa53729 + d11acee commit 13f8b8f

File tree: 67 files changed (+3236, -996 lines)


docker/Dockerfile.multi

Lines changed: 34 additions & 38 deletions
@@ -12,9 +12,10 @@ LABEL com.nvidia.eula="https://www.nvidia.com/en-us/agreements/enterprise-softwa
 LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/"
 
 # https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html
-# The default values come from `nvcr.io/nvidia/pytorch`
-ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc}
-ENV ENV=${ENV:-/etc/shinit_v2}
+ARG SH_ENV="/etc/shinit_v2"
+ENV ENV=${SH_ENV}
+ARG BASH_ENV="/etc/bash.bashrc"
+ENV BASH_ENV=${BASH_ENV}
 
 ARG GITHUB_MIRROR=""
 RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
@@ -43,48 +44,41 @@ COPY docker/common/install.sh \
 docker/common/install_ucx.sh \
 docker/common/install_nixl.sh \
 docker/common/install_etcd.sh \
-docker/common/install_mooncake.sh \
 ./
 
-RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
-PYTHON_VERSION=${PYTHON_VERSION} \
-bash ./install.sh --base && rm install_base.sh
-
-RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --cmake && rm install_cmake.sh
-
-RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --ccache && rm install_ccache.sh
-
-RUN bash ./install.sh --cuda_toolkit && rm install_cuda_toolkit.sh
-
 ARG TRT_VER
 ARG CUDA_VER
 ARG CUDNN_VER
 ARG NCCL_VER
 ARG CUBLAS_VER
-RUN TRT_VER=${TRT_VER} \
+ARG TORCH_INSTALL_TYPE="skip"
+RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
+PYTHON_VERSION=${PYTHON_VERSION} \
+TRT_VER=${TRT_VER} \
 CUDA_VER=${CUDA_VER} \
 CUDNN_VER=${CUDNN_VER} \
 NCCL_VER=${NCCL_VER} \
 CUBLAS_VER=${CUBLAS_VER} \
-bash ./install.sh --tensorrt && rm install_tensorrt.sh
-
-RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --polygraphy && rm install_polygraphy.sh
-
-RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py && rm install_mpi4py.sh
-
-ARG TORCH_INSTALL_TYPE="skip"
-RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch && rm install_pytorch.sh
-
-RUN bash ./install.sh --opencv && rm install.sh
-
-# Install UCX first
-RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_ucx.sh && rm install_ucx.sh
-
-# Install NIXL
-RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_nixl.sh && rm install_nixl.sh
-
-# Install etcd
-RUN bash ./install_etcd.sh && rm install_etcd.sh
+TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} \
+bash ./install.sh --base --cmake --ccache --cuda_toolkit --tensorrt --polygraphy --mpi4py --pytorch --opencv && \
+rm install_base.sh && \
+rm install_cmake.sh && \
+rm install_ccache.sh && \
+rm install_cuda_toolkit.sh && \
+rm install_tensorrt.sh && \
+rm install_polygraphy.sh && \
+rm install_mpi4py.sh && \
+rm install_pytorch.sh && \
+rm install.sh
+
+# Install UCX, NIXL, etcd
+# TODO: Combine these into the main install.sh script
+RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_ucx.sh && \
+GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_nixl.sh && \
+bash ./install_etcd.sh && \
+rm install_ucx.sh && \
+rm install_nixl.sh && \
+rm install_etcd.sh
 
 FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton
 
@@ -99,16 +93,18 @@ COPY --from=triton /opt/tritonserver/caches /opt/tritonserver/caches
 
 # Copy all installation scripts at once to reduce layers
 COPY docker/common/install_triton.sh \
+docker/common/install_mooncake.sh \
 ./
 
-RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && rm install_triton.sh
-
 # Install Mooncake, after triton handles boost requirement
-RUN if [ -f /etc/redhat-release ]; then \
+RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && \
+if [ -f /etc/redhat-release ]; then \
 echo "Rocky8 detected, skipping mooncake installation"; \
 else \
 bash ./install_mooncake.sh; \
-fi && rm install_mooncake.sh
+fi && \
+rm install_triton.sh && \
+rm install_mooncake.sh
 
 FROM ${DEVEL_IMAGE} AS wheel
 WORKDIR /src/tensorrt_llm
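
For context on the BASH_ENV/ENV lines above (see the Bash Startup Files link in the Dockerfile comment): BASH_ENV names a file that bash sources when it starts non-interactively, and ENV is the analogous hook for shells invoked as sh. A minimal sketch of the BASH_ENV behavior, using a throwaway path that is only an illustration:

    # /tmp/demo_env.sh is a hypothetical file used only for this sketch.
    echo 'export DEMO_FLAG=1' > /tmp/demo_env.sh
    # Non-interactive bash sources $BASH_ENV before running the command,
    # so the variable defined there is visible inside the child shell.
    BASH_ENV=/tmp/demo_env.sh bash -c 'echo "DEMO_FLAG=${DEMO_FLAG}"'   # prints DEMO_FLAG=1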

docker/Makefile

Lines changed: 11 additions & 1 deletion
@@ -75,8 +75,16 @@ define rewrite_tag
 $(shell echo $(IMAGE_WITH_TAG) | sed "s/\/tensorrt-llm:/\/tensorrt-llm-staging:/g")
 endef
 
+base_pull:
+@echo "Pulling base image: $(BASE_IMAGE):$(BASE_TAG)"
+docker pull $(BASE_IMAGE):$(BASE_TAG)
+
 %_build: DEVEL_IMAGE = $(if $(findstring 1,$(JENKINS_DEVEL)),$(shell . ../jenkins/current_image_tags.properties && echo $$LLM_DOCKER_IMAGE))
-%_build:
+%_build: SH_ENV = $(shell docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' $(BASE_IMAGE):$(BASE_TAG) \
+| grep '^ENV=' | sed 's/^[^=]*=//' 2>/dev/null)
+%_build: BASH_ENV = $(shell docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' $(BASE_IMAGE):$(BASE_TAG) \
+| grep '^BASH_ENV=' | sed 's/^[^=]*=//' 2>/dev/null)
+%_build: base_pull
 @echo "Building docker image: $(IMAGE_WITH_TAG)"
 docker buildx build $(DOCKER_BUILD_OPTS) $(DOCKER_BUILD_ARGS) \
 --progress $(DOCKER_PROGRESS) \
@@ -97,6 +105,8 @@ endef
 $(if $(GIT_COMMIT), --build-arg GIT_COMMIT="$(GIT_COMMIT)") \
 $(if $(GITHUB_MIRROR), --build-arg GITHUB_MIRROR="$(GITHUB_MIRROR)") \
 $(if $(PYTHON_VERSION), --build-arg PYTHON_VERSION="$(PYTHON_VERSION)") \
+$(if $(SH_ENV), --build-arg SH_ENV="$(SH_ENV)") \
+$(if $(BASH_ENV), --build-arg BASH_ENV="$(BASH_ENV)") \
 $(if $(STAGE), --target $(STAGE)) \
 --file Dockerfile.multi \
 --tag $(IMAGE_WITH_TAG) \
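
In the new %_build prerequisites above, SH_ENV and BASH_ENV are scraped out of the base image's own environment before the build starts. Outside make, the pipeline is roughly equivalent to the following sketch (it assumes BASE_IMAGE and BASE_TAG are set the same way the Makefile defines them, and that the image has already been pulled via base_pull):

    # Print every Config.Env entry of the base image, one per line,
    # then keep only the BASH_ENV entry and strip the "BASH_ENV=" prefix.
    docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' "$BASE_IMAGE:$BASE_TAG" \
        | grep '^BASH_ENV=' \
        | sed 's/^[^=]*=//'    # e.g. prints /etc/bash.bashrc

The extracted values are then forwarded to the Dockerfile through the --build-arg SH_ENV/BASH_ENV flags added further down in the same rule.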

docker/common/install.sh

Lines changed: 3 additions & 3 deletions
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -Eeo pipefail
 shopt -s nullglob
 trap 'echo "[install.sh] Error on line $LINENO" >&2' ERR
@@ -125,7 +125,7 @@ fi
 
 if [ $opencv -eq 1 ]; then
 echo "Installing OpenCV..."
-pip3 uninstall -y opencv
+bash -c "pip3 uninstall -y opencv"
 rm -rf /usr/local/lib/python3*/dist-packages/cv2/
-pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
+bash -c "pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir"
 fi

docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md

Lines changed: 4 additions & 4 deletions
@@ -162,7 +162,7 @@ P99 E2EL (ms): 1643.44
 For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
 
 $$
-\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
+\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
 $$
 
 Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -172,7 +172,7 @@ $$
 $$
 
 $$
-\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
 $$
 
 #### End-to-End (E2E) Latency
@@ -182,14 +182,14 @@ $$
 * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
 
 $$
-\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$
 
 #### Tokens Per Second (TPS) or Output Token Throughput
 * how many output tokens the system generates each second.
 
 $$
-\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$
 
 ### Request Time Breakdown
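
As a quick sanity check of the per-request TPOT formula above (the numbers are purely illustrative, not measurements from this commit): a request with an E2E latency of 1200 ms, a TTFT of 200 ms, and 101 output tokens gives

$$
\text{TPOT} = \frac{1200\ \text{ms} - 200\ \text{ms}}{101 - 1} = 10\ \text{ms per output token}
$$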

docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md

Lines changed: 4 additions & 4 deletions
@@ -400,7 +400,7 @@ P99 E2EL (ms): [result]
 For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
 
 $$
-\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
+\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
 $$
 
 Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -410,7 +410,7 @@ $$
 $$
 
 $$
-\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
 $$
 
 #### End-to-End (E2E) Latency
@@ -420,12 +420,12 @@ $$
 * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
 
 $$
-\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$
 
 #### Tokens Per Second (TPS) or Output Token Throughput
 * how many output tokens the system generates each second.
 
 $$
-\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$

docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md

Lines changed: 4 additions & 4 deletions
@@ -350,7 +350,7 @@ P99 E2EL (ms): [result]
 For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
 
 $$
-\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
+\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
 $$
 
 Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -360,7 +360,7 @@ $$
 $$
 
 $$
-\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
 $$
 
 #### End-to-End (E2E) Latency
@@ -370,12 +370,12 @@ $$
 * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
 
 $$
-\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$
 
 #### Tokens Per Second (TPS) or Output Token Throughput
 * how many output tokens the system generates each second.
 
 $$
-\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$

docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md

Lines changed: 4 additions & 4 deletions
@@ -355,7 +355,7 @@ P99 E2EL (ms): [result]
 For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
 
 $$
-\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
+\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
 $$
 
 Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -365,7 +365,7 @@ $$
 $$
 
 $$
-\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
 $$
 
 #### End-to-End (E2E) Latency
@@ -375,12 +375,12 @@ $$
 * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
 
 $$
-\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$
 
 #### Tokens Per Second (TPS) or Output Token Throughput
 * how many output tokens the system generates each second.
 
 $$
-\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$

docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md

Lines changed: 4 additions & 4 deletions
@@ -347,7 +347,7 @@ P99 E2EL (ms): [result]
 For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
 
 $$
-\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
+\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
 $$
 
 Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -357,7 +357,7 @@ $$
 $$
 
 $$
-\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
+\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
 $$
 
 #### End-to-End (E2E) Latency
@@ -367,12 +367,12 @@ $$
 * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
 
 $$
-\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$
 
 #### Tokens Per Second (TPS) or Output Token Throughput
 * how many output tokens the system generates each second.
 
 $$
-\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
+\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
 $$

jenkins/L0_MergeRequest.groovy

Lines changed: 2 additions & 0 deletions
@@ -155,6 +155,8 @@ def globalVars = [
 boolean enableUpdateGitlabStatus =
 !testFilter[ENABLE_SKIP_TEST] &&
 !testFilter[ONLY_MULTI_GPU_TEST] &&
+!testFilter[DISABLE_MULTI_GPU_TEST] &&
+!testFilter[DEBUG_MODE] &&
 testFilter[GPU_TYPE_LIST] == null &&
 testFilter[TEST_STAGE_LIST] == null &&
 testFilter[TEST_BACKEND] == null

jenkins/L0_Test.groovy

Lines changed: 12 additions & 20 deletions
@@ -642,6 +642,11 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
 echo "--gpus ${gpuCount}"
 fi
 """, returnStdout: true).trim()
+
+if (cluster.host.contains("dlcluster")) {
+dockerArgs += " " + sh(script: 'echo " -e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"', returnStdout: true).trim()
+dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
+}
 }
 
 dockerArgs = "${dockerArgs} " +
@@ -655,10 +660,6 @@
 "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
 "--cap-add=SYSLOG"
 
-if (partition.clusterName == "dlcluster") {
-dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
-dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
-}
 echo "Final dockerArgs: ${dockerArgs}"
 } else {
 error "The Slurm node does not come online in the waiting period. Terminating the job."
@@ -750,6 +751,8 @@ def getPytestBaseCommandLine(
 extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
 // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
 extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
+// Enable NCCL debug information for multi-GPU tests
+extraInternalEnv += " NCCL_DEBUG=INFO"
 
 def testCmdLine = [
 "LLM_ROOT=${llmSrc}",
@@ -996,8 +999,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 export resourcePathNode=$resourcePathNode
 export pytestCommand="$pytestCommand"
 export coverageConfigFile="$coverageConfigFile"
-export NVIDIA_IMEX_CHANNELS=0
-[ -z "\${NVIDIA_VISIBLE_DEVICES:-}" ] && export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+export NVIDIA_IMEX_CHANNELS=\${NVIDIA_IMEX_CHANNELS:-0}
+export NVIDIA_VISIBLE_DEVICES=\${NVIDIA_VISIBLE_DEVICES:-\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))}
+
+echo "Env NVIDIA_IMEX_CHANNELS: \$NVIDIA_IMEX_CHANNELS"
+echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES"
 
 ${srunPrologue}
 
@@ -2248,20 +2254,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
 def noRegularTests = false
 def noIsolateTests = false
 def rerunFailed = false
-
-echoNodeAndGpuInfo(pipeline, stageName)
-sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
-
-def extraInternalEnv = ""
-def pytestTestTimeout = "3600"
-
-// TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
-extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
-// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
-extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
-// Enable NCCL debug information for multi-GPU tests
-extraInternalEnv += " NCCL_DEBUG=INFO"
-
 def testDBList = renderTestDB(testList, llmSrc, stageName)
 
 // Process shard test list and create separate files for regular and isolate tests
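
The reworked sbatch exports above only fill in a default when the variable is unset. With the GPU count hard-coded instead of queried from nvidia-smi, the NVIDIA_VISIBLE_DEVICES fallback expands roughly like this (a sketch; gpu_count=8 is an assumed value):

    gpu_count=8                                    # stand-in for the nvidia-smi count query
    seq -s, 0 $((gpu_count - 1))                   # prints 0,1,2,3,4,5,6,7
    export NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-$(seq -s, 0 $((gpu_count - 1)))}
    export NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}   # same pattern: keep any value already set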
