diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 3cc9decdf..2cbd10b9c 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -23,6 +23,18 @@ check_env_vars() { fi } +# Setup torch profiler environment when ENABLE_PROFILE=true +# IMPORTANT: This runs when the library is sourced (before server starts) +# so that the server can detect VLLM_TORCH_PROFILER_DIR and enable profiler endpoints +if [ "${ENABLE_PROFILE:-}" = "true" ]; then + # Only set default directory if VLLM_TORCH_PROFILER_DIR is not already set + if [ -z "${VLLM_TORCH_PROFILER_DIR:-}" ]; then + export VLLM_TORCH_PROFILER_DIR="/workspace/profiling" + fi + mkdir -p "$VLLM_TORCH_PROFILER_DIR" + echo "Torch profiler enabled. Output directory: $VLLM_TORCH_PROFILER_DIR" +fi + # Wait for server to be ready by polling the health endpoint # All parameters are required # Parameters: @@ -106,6 +118,7 @@ wait_for_server_ready() { # --result-dir: Result directory # --use-chat-template: Optional flag to enable chat template # --server-pid: Optional server process ID to monitor during benchmark +# --enable-profile: Optional flag to enable torch profiler run_benchmark_serving() { set +x local model="" @@ -121,6 +134,7 @@ run_benchmark_serving() { local workspace_dir="" local use_chat_template=false local server_pid="" + local enable_profile=false while [[ $# -gt 0 ]]; do case $1 in @@ -176,6 +190,10 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; + --enable-profile) + enable_profile=true + shift + ;; *) echo "Unknown parameter: $1" return 1 @@ -255,6 +273,11 @@ run_benchmark_serving() { benchmark_cmd+=(--use-chat-template) fi + # Add --profile if torch profiler is enabled (via --enable-profile flag or ENABLE_PROFILE env var) + if [[ "$enable_profile" == true ]] || [[ "${ENABLE_PROFILE:-}" == "true" ]]; then + benchmark_cmd+=(--profile) + fi + # Run benchmark with optional server monitoring set -x if [[ -n "$server_pid" ]]; then diff --git a/runners/launch_b200-dgxc.sh 
b/runners/launch_b200-dgxc.sh index f0dbf2107..4a8c4b22f 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -32,6 +32,7 @@ docker run --rm --init --network host --name $server_name \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e RUNNER_TYPE \ +-e ENABLE_PROFILE -e VLLM_TORCH_PROFILER_DIR \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}.sh" diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 976d5a5fd..eeeb42c25 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -12,6 +12,7 @@ docker run --rm --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ +-e ENABLE_PROFILE -e VLLM_TORCH_PROFILER_DIR \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh index cc70d5bbc..f88c470d1 100644 --- a/runners/launch_mi300x-amd.sh +++ b/runners/launch_mi300x-amd.sh @@ -15,6 +15,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ +-e ENABLE_PROFILE -e VLLM_TORCH_PROFILER_DIR \ --entrypoint=/bin/bash \ $IMAGE \ 
benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x.sh" diff --git a/runners/launch_mi300x-cr.sh b/runners/launch_mi300x-cr.sh index b2dbaee83..0ef826467 100644 --- a/runners/launch_mi300x-cr.sh +++ b/runners/launch_mi300x-cr.sh @@ -15,6 +15,7 @@ docker run --rm --ipc=host --shm-size=16g --network=host --name=$server_name \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \ -e ISL -e OSL -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RANDOM_RANGE_RATIO -e RESULT_FILENAME -e RUN_EVAL -e RUNNER_TYPE \ +-e ENABLE_PROFILE -e VLLM_TORCH_PROFILER_DIR \ --entrypoint=/bin/bash \ $IMAGE \ benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi300x.sh"