6 changes: 3 additions & 3 deletions .github/workflows/example_tests.yml
@@ -93,11 +93,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        example: [llm_ptq]
+        example: [llm_ptq, vlm_ptq]
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2"
+      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-1
@@ -111,7 +111,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2"
+      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-2
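To reproduce this CI job outside of Actions, one option is to pull the same container tag and mirror the `pip_install_extras` value the workflow passes; a minimal sketch, where the mount path and repo location are assumptions rather than part of this diff:

```bash
# Pull the container tag the workflow now pins.
docker pull nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4

# Start it with GPUs and the checked-out repo mounted (path is illustrative).
docker run --gpus all --rm -it \
  -v "$PWD":/workspace/modelopt \
  nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4 bash

# Inside the container, mirror pip_install_extras="[hf,dev-test]".
cd /workspace/modelopt && pip install -e ".[hf,dev-test]"
```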
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -27,6 +27,7 @@ Model Optimizer Changelog (Linux)

 **Misc**

+- Bump TensorRT-LLM docker to 1.2.0rc4.
 - Bump minimum recommended transformers version to 4.53.
 - Replace the ONNX simplification package ``onnxsim`` with ``onnxslim``.

2 changes: 1 addition & 1 deletion docs/source/getting_started/_installation_for_Linux.rst
@@ -18,7 +18,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
 +-------------------------+-----------------------------+
 | PyTorch                 | >=2.6                       |
 +-------------------------+-----------------------------+
-| TensorRT-LLM (Optional) | 1.1.0rc2.post2              |
+| TensorRT-LLM (Optional) | 1.2.0rc4                    |
 +-------------------------+-----------------------------+
 | ONNX Runtime (Optional) | 1.22                        |
 +-------------------------+-----------------------------+
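A quick way to confirm a local environment matches the versions in this table is to query each optional component; a small sketch, assuming the packages are already installed:

```bash
# Print the versions the requirements table refers to.
python -c "import torch; print('PyTorch:', torch.__version__)"
python -c "import tensorrt_llm; print('TensorRT-LLM:', tensorrt_llm.__version__)"
python -c "import onnxruntime; print('ONNX Runtime:', onnxruntime.__version__)"
```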
2 changes: 1 addition & 1 deletion examples/llm_ptq/README.md
@@ -27,7 +27,7 @@ This section focuses on Post-training quantization, a technique that reduces mod

 ### Docker

-For Hugging Face models, please use the TensorRT-LLM docker image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2`).
+For Hugging Face models, please use the TensorRT-LLM docker image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4`).
 For NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.09`).
 Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.

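A typical session with the updated image starts the container with the repo mounted and runs the example from inside it; a sketch, where the mount path, model name, and script invocation are assumptions rather than part of this diff:

```bash
# Launch the recommended TensorRT-LLM container with this repo mounted.
docker run --gpus all --rm -it \
  -v "$PWD":/workspace \
  nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4 bash

# Inside the container (script name and flags are assumed from this example's layout):
cd /workspace/examples/llm_ptq
scripts/huggingface_example.sh --model meta-llama/Llama-3.1-8B-Instruct --quant fp8
```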
4 changes: 2 additions & 2 deletions examples/specdec_bench/README.md
@@ -4,7 +4,7 @@

 This benchmark is meant to be a lightweight layer on top of an existing vLLM/SGLang/TRT-LLM installation. For example, no install
 is required if one is running in the following dockers: `vllm/vllm-openai:v0.11.0` (vLLM), `lmsysorg/sglang:v0.5.4.post2` (SGLang), or
-`nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc1` (TRT-LLM).
+`nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4` (TRT-LLM).

Next

@@ -16,7 +16,7 @@ cd examples/specdec_bench

 Collect relevant metrics on acceptance rate, timing, and outputs for Speculative Decoding methods.
 Acceptance rate refers to the number of tokens generated on every iteration. For a standard Autoregressive LLM, this number
-is just 1. 
+is just 1.

## Getting Started

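Since no extra install is needed inside the listed images, getting to the benchmark is mostly a matter of launching one of them with the repo mounted; a sketch for the TRT-LLM image (mount path assumed, benchmark arguments omitted):

```bash
# Any of the three listed images works; shown here with the TRT-LLM one.
docker run --gpus all --rm -it \
  -v "$PWD":/workspace \
  nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4 bash

# Inside the container, as in the Getting Started snippet above:
cd /workspace/examples/specdec_bench
```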
@@ -20,16 +20,16 @@
 # THE BIWEEKLY CAPACITY MEETING. IF YOU DON'T KNOW WHO THE PIC OF YOUR CSRG PPP
 # MANAGEMENT IS, GO WITH `-p backfill -t 00:25:00`.

-#SBATCH -A coreai_dlalgo_modelopt
-#SBATCH --job-name=coreai_dlalgo_modelopt-generate_eagle_hidden_states
+#SBATCH -A <account_name>
+#SBATCH --job-name=<job_name>
 #SBATCH --nodes=1 --ntasks-per-node=4 --gpus-per-node=4
 #SBATCH -p batch
 #SBATCH -t 04:00:00

 echo "SLURM_ARRAY_TASK_ID: $SLURM_ARRAY_TASK_ID"
 echo "SLURM_ARRAY_TASK_COUNT: $SLURM_ARRAY_TASK_COUNT"

-CONTAINER="nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0"
+CONTAINER="nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"

 INPUT_DIR="<Can be directory containing the .jsonl files, or path to single .jsonl file>"
 DUMP_DIR="<Directory for output hidden states>"
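Once the placeholders are filled in, the script is submitted through Slurm in the usual way; a sketch, where the script filename and array size are hypothetical (the `SLURM_ARRAY_TASK_*` echoes suggest it is meant to run as a job array):

```bash
# Submit as a 4-way array so SLURM_ARRAY_TASK_ID/COUNT can partition the input
# (filename and array size are illustrative, not from this diff).
sbatch --array=0-3 generate_eagle_hidden_states.sh

# The account/partition/time placeholders can also be overridden at submit time:
sbatch -A my_account -p backfill -t 00:25:00 --array=0-3 generate_eagle_hidden_states.sh
```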
4 changes: 1 addition & 3 deletions tests/examples/vlm_ptq/test_qwen_vl.py
@@ -17,10 +17,8 @@
 import pytest
 from _test_utils.examples.models import QWEN_VL_PATH
 from _test_utils.examples.run_command import run_vlm_ptq_command
-from _test_utils.torch.misc import minimum_gpu


 @pytest.mark.parametrize("quant", ["fp8", "int8_sq", "nvfp4"])
-@minimum_gpu(2)
-def test_qwen_vl_multi_gpu(quant):
+def test_qwen_vl(quant):
     run_vlm_ptq_command(model=QWEN_VL_PATH, quant=quant)
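With the `minimum_gpu(2)` gate removed, the test should run on a single GPU; a sketch of invoking one parametrization directly, assuming the example-test prerequisites (a GPU and the Qwen-VL checkpoint at `QWEN_VL_PATH`) are available:

```bash
# Run only the fp8 parametrization of the renamed test.
pytest tests/examples/vlm_ptq/test_qwen_vl.py -k fp8 -v
```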