From 4630fe2c5130b0a56c43f28c77b96764b64876fe Mon Sep 17 00:00:00 2001 From: Will Guo Date: Sun, 1 Feb 2026 23:08:13 +0000 Subject: [PATCH 1/5] Integrate Automated QDQ placement tool - part 4.1 Signed-off-by: Will Guo --- examples/qdq_placement/README.md | 234 +++++++++++++++++++++++ examples/qdq_placement/set_batch_size.py | 121 ++++++++++++ modelopt/onnx/logging_config.py | 2 +- 3 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 examples/qdq_placement/README.md create mode 100644 examples/qdq_placement/set_batch_size.py diff --git a/examples/qdq_placement/README.md b/examples/qdq_placement/README.md new file mode 100644 index 000000000..c289ea518 --- /dev/null +++ b/examples/qdq_placement/README.md @@ -0,0 +1,234 @@ +# QDQ Placement Optimization Example + +This example demonstrates automated Q/DQ (Quantize/Dequantize) node placement optimization for ONNX models using TensorRT performance measurements. + +## Prerequisites + +### Get the Model + +Download the ResNet50 model from the ONNX Model Zoo: + +```bash +# Download ResNet50 from ONNX Model Zoo +curl -L -o resnet50_Opset17.onnx https://github.com/onnx/models/raw/main/Computer_Vision/resnet50_Opset17_torch_hub/resnet50_Opset17.onnx +``` + +### Set Fixed Batch Size (Recommended) + +The downloaded model has a dynamic batch size. For best performance with TensorRT benchmarking, set a fixed batch size: + +```bash +# Set batch size to 128 using the provided script +python3 set_batch_size.py resnet50_Opset17.onnx --batch-size 128 --output resnet50.bs128.onnx + +# Or for other batch sizes +python3 set_batch_size.py resnet50_Opset17.onnx --batch-size 1 --output resnet50.bs1.onnx +``` + +This creates `resnet50.bs128.onnx` with a fixed batch size of 128, which is optimal for TensorRT performance benchmarking. + +**Note:** The script requires the `onnx` package. If you have modelopt installed, this dependency should already be available. 
+ +### What's in This Directory + +- `set_batch_size.py` - Script to convert dynamic batch size models to fixed batch size +- `README.md` - This guide + +**Note:** ONNX model files are not included in the repository (excluded via `.gitignore`). Download and prepare them using the instructions above. + +## Quick Start + +### Basic Usage + +Optimize the ResNet50 model with INT8 quantization: + +```bash +# Using the fixed batch size model (recommended) +python3 -m modelopt.onnx.quantization.autotune \ + --model resnet50.bs128.onnx \ + --output ./resnet50_results \ + --quant-type int8 \ + --schemes-per-region 30 + +# Or use the original dynamic batch size model +python3 -m modelopt.onnx.quantization.autotune \ + --model resnet50_Opset17.onnx \ + --output ./resnet50_results \ + --quant-type int8 \ + --schemes-per-region 30 +``` + +This will: + +1. Automatically discover optimization regions in your model +2. Test 30 different Q/DQ placement schemes per region pattern +3. Measure TensorRT performance for each scheme +4. 
Export the best optimized model to `./resnet50_results/optimized_final.onnx` + +### FP8 Quantization + +For FP8 quantization (faster on modern GPUs): + +```bash +python3 -m modelopt.onnx.quantization.autotune \ + --model resnet50.bs128.onnx \ + --output ./resnet50_fp8_results \ + --quant-type fp8 \ + --schemes-per-region 50 +``` + +### Faster Exploration + +For quick experiments, reduce the number of schemes: + +```bash +python3 -m modelopt.onnx.quantization.autotune \ + --model resnet50.bs128.onnx \ + --output ./resnet50_quick \ + --schemes-per-region 15 +``` + +## Output Structure + +After running, you'll get: + +```log +resnet50_results/ +├── optimized_final.onnx # Your optimized model +├── baseline.onnx # Baseline for comparison +├── autotuner_state.yaml # Resume checkpoint +├── autotuner_state_pattern_cache.yaml # Reusable patterns +└── logs/ + ├── baseline.log # TensorRT baseline log + ├── region_*_scheme_*.log # Per-scheme logs + └── final.log # Final model log +``` + +## Using the Optimized Model + +Deploy with TensorRT: + +```bash +trtexec --onnx=resnet50_results/optimized_final.onnx \ + --saveEngine=resnet50.engine \ + --stronglyTyped +``` + +## Pattern Cache (Transfer Learning) + +Reuse learned patterns on similar models: + +```bash +# First optimization on ResNet50 +python3 -m modelopt.onnx.quantization.autotune \ + --model resnet50.bs128.onnx \ + --output ./resnet50_run + +# Download and prepare ResNet101 (or any similar model) +curl -L -o resnet101_Opset17.onnx https://github.com/onnx/models/raw/main/Computer_Vision/resnet101-v2-7.onnx +python3 set_batch_size.py resnet101_Opset17.onnx --batch-size 128 --output resnet101.bs128.onnx + +# Reuse patterns from ResNet50 on ResNet101 (much faster!) 
+python3 -m modelopt.onnx.quantization.autotune \ + --model resnet101.bs128.onnx \ + --output ./resnet101_run \ + --pattern-cache ./resnet50_run/autotuner_state_pattern_cache.yaml +``` + +## Optimize from Existing QDQ Model + +If you already have a quantized model (e.g., from manual quantization or another tool), you can use it as a starting point to potentially find even better Q/DQ placements: + +```bash +# Use an existing QDQ model as baseline +python3 -m modelopt.onnx.quantization.autotune \ + --model resnet50.bs128.onnx \ + --output ./resnet50_improved \ + --qdq-baseline resnet50_quantized.onnx \ + --schemes-per-region 40 +``` + +This will: + +1. Extract Q/DQ insertion points from the baseline model +2. Use them as seed schemes during optimization +3. Generate and test variations to find better placements +4. Compare against the baseline performance + +**Use cases:** + +- **Improve existing quantization**: Fine-tune manually quantized models +- **Compare tools**: Test if autotuner can beat other quantization methods +- **Bootstrap optimization**: Start from expert-tuned schemes + +**Example workflow:** + +```bash +# Step 1: Create initial quantized model with any quantization tool +# For example, using modelopt's quantize function: +python3 -c " +import numpy as np +from modelopt.onnx.quantization import quantize + +# Create dummy calibration data (replace with real data for production) +dummy_input = np.random.randn(128, 3, 224, 224).astype(np.float32) +quantize( + 'resnet50.bs128.onnx', + calibration_data=dummy_input, + calibration_method='entropy', + output_path='resnet50_quantized.onnx' +) +" + +# Step 2: Use the quantized baseline for autotuning +python3 -m modelopt.onnx.quantization.autotune \ + --model resnet50.bs128.onnx \ + --output ./resnet50_autotuned \ + --qdq-baseline resnet50_quantized.onnx \ + --schemes-per-region 50 + +# The autotuner will try to find better Q/DQ placements than the initial quantization +``` + +**Note:** This example uses dummy 
calibration data. For production use, provide real calibration data representative of your inference workload. + +## Programmatic API Usage + +All examples above use the command-line interface. For **low-level programmatic control** in your Python code, use the Python API directly. This allows you to: + +- Integrate autotuning into custom pipelines +- Implement custom evaluation functions +- Control state management and checkpointing +- Build custom optimization workflows + +**See the API Reference documentation for low-level usage:** + +- [`docs/source/reference/2_qdq_placement.rst`](../../docs/source/reference/2_qdq_placement.rst) + +The API docs include detailed examples of: + +- Using the `Autotuner` class directly +- Customizing region discovery and scheme generation +- Managing optimization state programmatically +- Implementing custom performance evaluators + +## Documentation + +For comprehensive documentation on QDQ placement optimization, see: + +- **User Guide**: [`docs/source/guides/9_qdq_placement.rst`](../../docs/source/guides/9_qdq_placement.rst) + - Detailed explanations of how the autotuner works + - Advanced usage patterns and best practices + - Configuration options and performance tuning + - Troubleshooting common issues + +- **API Reference**: [`docs/source/reference/2_qdq_placement.rst`](../../docs/source/reference/2_qdq_placement.rst) + - Complete API documentation for all classes and functions + - Low-level usage examples + - State management and pattern cache details + +For command-line help: + +```bash +python3 -m modelopt.onnx.quantization.autotune --help +``` diff --git a/examples/qdq_placement/set_batch_size.py b/examples/qdq_placement/set_batch_size.py new file mode 100644 index 000000000..205dbb551 --- /dev/null +++ b/examples/qdq_placement/set_batch_size.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Script to set a fixed batch size for ONNX models. + +This script modifies an ONNX model with dynamic batch size to use a fixed batch size, +which is often beneficial for TensorRT performance benchmarking. + +Usage: + python set_batch_size.py resnet50_Opset17.onnx --batch-size 128 --output resnet50.bs128.onnx +""" + +import argparse + +import onnx +from onnx import shape_inference + + +def set_batch_size(model_path: str, batch_size: int, output_path: str) -> None: + """ + Set a fixed batch size for an ONNX model. 
+ + Args: + model_path: Path to input ONNX model + batch_size: Desired batch size + output_path: Path to save modified model + """ + # Load the model + print(f"Loading model from {model_path}...") + model = onnx.load(model_path) + + # Get the input tensor + graph = model.graph + input_tensor = graph.input[0] + + print( + f"Original input shape: {[d.dim_param or d.dim_value for d in input_tensor.type.tensor_type.shape.dim]}" + ) + + # Modify the batch dimension (first dimension) + if len(input_tensor.type.tensor_type.shape.dim) > 0: + input_tensor.type.tensor_type.shape.dim[0].dim_value = batch_size + # Clear any symbolic dimension parameter + input_tensor.type.tensor_type.shape.dim[0].ClearField("dim_param") + + # Also update output shapes if needed + for output_tensor in graph.output: + if len(output_tensor.type.tensor_type.shape.dim) > 0: + output_tensor.type.tensor_type.shape.dim[0].dim_value = batch_size + output_tensor.type.tensor_type.shape.dim[0].ClearField("dim_param") + + print( + f"Modified input shape: {[d.dim_param or d.dim_value for d in input_tensor.type.tensor_type.shape.dim]}" + ) + + # Run shape inference to propagate the batch size through the model + print("Running shape inference...") + try: + model = shape_inference.infer_shapes(model) + except Exception as e: + print(f"Warning: Shape inference failed: {e}") + print("Continuing without shape inference...") + + # Save the modified model + print(f"Saving modified model to {output_path}...") + onnx.save(model, output_path) + + # Verify the saved model + print("Verifying model...") + onnx.checker.check_model(output_path) + print("✓ Model saved and verified successfully!") + + +def main(): + parser = argparse.ArgumentParser( + description="Set a fixed batch size for an ONNX model", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Set batch size to 128 for ResNet50 + python set_batch_size.py resnet50_Opset17.onnx --batch-size 128 --output resnet50.bs128.onnx + + # 
Set batch size to 1 for single-image inference + python set_batch_size.py resnet50_Opset17.onnx --batch-size 1 --output resnet50.bs1.onnx + """, + ) + + parser.add_argument("model", help="Path to input ONNX model") + parser.add_argument( + "--batch-size", "-b", type=int, default=128, help="Batch size to set (default: 128)" + ) + parser.add_argument( + "--output", "-o", help="Path to save modified model (default: _bs.onnx)" + ) + + args = parser.parse_args() + + # Generate output path if not provided + if args.output is None: + base_name = args.model.rsplit(".", 1)[0] + args.output = f"{base_name}.bs{args.batch_size}.onnx" + + set_batch_size(args.model, args.batch_size, args.output) + + +if __name__ == "__main__": + main() diff --git a/modelopt/onnx/logging_config.py b/modelopt/onnx/logging_config.py index 99468b87f..bcc83cdc9 100644 --- a/modelopt/onnx/logging_config.py +++ b/modelopt/onnx/logging_config.py @@ -38,7 +38,7 @@ def configure_logging(level=logging.INFO, log_file=None): for handler in logger.handlers[:]: logger.removeHandler(handler) - formatter = logging.Formatter("[modelopt][onnx] - %(levelname)s - %(message)s") + formatter = logging.Formatter("%(asctime)s - [modelopt][onnx] - %(levelname)s - %(message)s") # Add file handler if log_file is specified if log_file: From b05fb82815d6be23eb2c82b4cadbfb9c6780a8a8 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Tue, 3 Feb 2026 02:40:06 +0000 Subject: [PATCH 2/5] Add remote autotuning to example Signed-off-by: Will Guo --- examples/qdq_placement/README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/qdq_placement/README.md b/examples/qdq_placement/README.md index c289ea518..7f20c6b68 100644 --- a/examples/qdq_placement/README.md +++ b/examples/qdq_placement/README.md @@ -181,13 +181,22 @@ quantize( " # Step 2: Use the quantized baseline for autotuning +# The autotuner will try to find better Q/DQ placements than the initial quantization python3 -m 
modelopt.onnx.quantization.autotune \ --model resnet50.bs128.onnx \ --output ./resnet50_autotuned \ --qdq-baseline resnet50_quantized.onnx \ --schemes-per-region 50 -# The autotuner will try to find better Q/DQ placements than the initial quantization +# TensorRT 10.16 support remote autotuning, pass remoteAutoTuningConfig to trtexec to +# benchmark with remote autotuning. +python3 -m modelopt.onnx.quantization.autotune \ + --model resnet50.bs128.onnx \ + --output ./resnet50_autotuned \ + --qdq-baseline resnet50_quantized.onnx \ + --schemes-per-region 50 \ + --use_trtexec \ + --trtexec_benchmark_args "--remoteAutoTuningConfig=\"\"" ``` **Note:** This example uses dummy calibration data. For production use, provide real calibration data representative of your inference workload. From c06a40559e11e5818e5a7c2c4d4b2d7081834601 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Wed, 4 Feb 2026 02:48:28 +0000 Subject: [PATCH 3/5] update remote autotuner example Signed-off-by: Will Guo --- examples/qdq_placement/README.md | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/examples/qdq_placement/README.md b/examples/qdq_placement/README.md index 7f20c6b68..beed4f007 100644 --- a/examples/qdq_placement/README.md +++ b/examples/qdq_placement/README.md @@ -114,7 +114,7 @@ trtexec --onnx=resnet50_results/optimized_final.onnx \ --stronglyTyped ``` -## Pattern Cache (Transfer Learning) +## Pattern Cache Reuse learned patterns on similar models: @@ -187,19 +187,32 @@ python3 -m modelopt.onnx.quantization.autotune \ --output ./resnet50_autotuned \ --qdq-baseline resnet50_quantized.onnx \ --schemes-per-region 50 +``` + +**Note:** This example uses dummy calibration data. For production use, provide real calibration data representative of your inference workload. + +## Remote Autotuning with TensorRT + +TensorRT 10.16+ supports remote autotuning, which allows you to offload TensorRT's optimization process to remote hardware. 
This is useful when you want to optimize models for different target GPUs without having direct access to them. + +To use remote autotuning during Q/DQ placement optimization: -# TensorRT 10.16 support remote autotuning, pass remoteAutoTuningConfig to trtexec to -# benchmark with remote autotuning. +```bash python3 -m modelopt.onnx.quantization.autotune \ --model resnet50.bs128.onnx \ - --output ./resnet50_autotuned \ - --qdq-baseline resnet50_quantized.onnx \ + --output ./resnet50_remote_autotuned \ --schemes-per-region 50 \ --use_trtexec \ --trtexec_benchmark_args "--remoteAutoTuningConfig=\"\"" ``` -**Note:** This example uses dummy calibration data. For production use, provide real calibration data representative of your inference workload. +**Requirements:** + +- TensorRT 10.16 or later +- Valid remote autotuning configuration +- `--use_trtexec` flag must be enabled + +Replace `` with your actual remote autotuning configuration string provided by your TensorRT setup. ## Programmatic API Usage From cda5eb9ee60f6d5e39e2046483c958f2ee20786b Mon Sep 17 00:00:00 2001 From: Will Guo Date: Mon, 2 Mar 2026 09:26:02 +0000 Subject: [PATCH 4/5] resolve comments Signed-off-by: Will Guo --- examples/onnx/autoqdq/README.md | 299 ++++++++++++++++++ .../autoqdq}/set_batch_size.py | 39 ++- examples/qdq_placement/README.md | 256 --------------- 3 files changed, 329 insertions(+), 265 deletions(-) create mode 100644 examples/onnx/autoqdq/README.md rename examples/{qdq_placement => onnx/autoqdq}/set_batch_size.py (73%) delete mode 100644 examples/qdq_placement/README.md diff --git a/examples/onnx/autoqdq/README.md b/examples/onnx/autoqdq/README.md new file mode 100644 index 000000000..01772a993 --- /dev/null +++ b/examples/onnx/autoqdq/README.md @@ -0,0 +1,299 @@ +# QDQ Placement Optimization Example + +This example demonstrates automated Q/DQ (Quantize/Dequantize) node placement optimization for ONNX models using TensorRT performance measurements. 
+ +## Table of Contents + +- [Prerequisites](#prerequisites) + - [Get the Model](#get-the-model) + - [Set Fixed Batch Size](#set-fixed-batch-size) + - [What's in This Directory](#whats-in-this-directory) +- [Quick Start](#quick-start) + - [Basic Usage](#basic-usage) + - [FP8 Quantization](#fp8-quantization) + - [Faster Exploration](#faster-exploration) +- [Output Structure](#output-structure) +- [Region Inspection](#region-inspection) +- [Using the Optimized Model](#using-the-optimized-model) +- [Pattern Cache](#pattern-cache) +- [Optimize from Existing QDQ Model](#optimize-from-existing-qdq-model) +- [Remote Autotuning with TensorRT](#remote-autotuning-with-tensorrt) +- [Programmatic API Usage](#programmatic-api-usage) +- [Documentation](#documentation) + +## Prerequisites + +### Get the Model + +Download the ResNet50 model from the ONNX Model Zoo: + +```bash +# Download ResNet50 from ONNX Model Zoo +curl -L -o resnet50_Opset17.onnx https://github.com/onnx/models/raw/main/Computer_Vision/resnet50_Opset17_torch_hub/resnet50_Opset17.onnx +``` + +### Set Fixed Batch Size + +The downloaded model has a dynamic batch size. For best performance with TensorRT benchmarking, set a fixed batch size: + +```bash +# Set batch size to 128 using the provided script +python3 set_batch_size.py resnet50_Opset17.onnx --batch-size 128 --output resnet50.bs128.onnx + +# Or for other batch sizes +python3 set_batch_size.py resnet50_Opset17.onnx --batch-size 1 --output resnet50.bs1.onnx +``` + +This creates `resnet50.bs128.onnx` with a fixed batch size of 128, which is optimal for TensorRT performance benchmarking. + +**Note:** The script requires the `onnx` package. + +### What's in This Directory + +- `set_batch_size.py` - Script to convert dynamic batch size models to fixed batch size +- `README.md` - This guide + +**Note:** ONNX model files are not included in the repository (excluded via `.gitignore`). Download and prepare them using the instructions above. 
+ +## Quick Start + +### Basic Usage + +Optimize the ResNet50 model with INT8 quantization: + +```bash +# Using the fixed batch size model +python3 -m modelopt.onnx.quantization.autotune \ + --onnx_path resnet50.bs128.onnx \ + --output_dir ./resnet50_results \ + --quant_type int8 \ + --schemes_per_region 30 + +# Or use the original dynamic batch size model, batch is set to 1 in benchmark +python3 -m modelopt.onnx.quantization.autotune \ + --onnx_path resnet50_Opset17.onnx \ + --output_dir ./resnet50_results \ + --quant_type int8 \ + --schemes_per_region 30 +``` + +Short options: `-m` for `--onnx_path`, `-o` for `--output_dir`, `-s` for `--schemes_per_region`. Default output directory is `./autotuner_output` if `--output_dir` is omitted. + +This will: + +1. Automatically discover optimization regions in the model +2. Test 30 different Q/DQ placement schemes per region pattern +3. Measure TensorRT performance for each scheme +4. Export the best optimized model to `./resnet50_results/optimized_final.onnx` + +### FP8 Quantization + +For FP8 quantization: + +```bash +python3 -m modelopt.onnx.quantization.autotune \ + --onnx_path resnet50.bs128.onnx \ + --output_dir ./resnet50_fp8_results \ + --quant_type fp8 \ + --schemes_per_region 50 +``` + +### Faster Exploration + +For quick experiments, reduce the number of schemes: + +```bash +python3 -m modelopt.onnx.quantization.autotune \ + --onnx_path resnet50.bs128.onnx \ + --output_dir ./resnet50_quick \ + --schemes_per_region 15 +``` + +## Output Structure + +After running, the output workspace will be: + +```log +resnet50_results/ +├── optimized_final.onnx # Optimized model +├── baseline.onnx # Baseline for comparison +├── autotuner_state.yaml # Resume checkpoint +├── autotuner_state_pattern_cache.yaml # Reusable pattern cache +├── logs/ +│ ├── baseline.log # TensorRT baseline log +│ ├── region_*_scheme_*.log # Per-scheme logs +│ └── final.log # Final model log +└── region_models/ # Best model per region (intermediate) + 
└── region_*_level_*.onnx +``` + +## Region Inspection + +To debug how the autotuner discovers and partitions regions in your model, use the `region_inspect` tool. It runs the same region search as the autotuner and prints the region hierarchy, node counts, and summary statistics (without running benchmarks). + +```bash +# Basic inspection (regions with quantizable ops only) +python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50.bs128.onnx + +# Verbose mode for detailed debug logging +python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50.bs128.onnx --verbose + +# Custom maximum sequence region size +python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50.bs128.onnx --max-sequence-size 20 + +# Include all regions (including those without Conv/MatMul etc.) +python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50.bs128.onnx --include-all-regions +``` + +Short option: `-m` for `--model`, `-v` for `--verbose`. Use this to verify region boundaries and counts before or during autotuning. 
+
+## Using the Optimized Model
+
+Deploy with TensorRT:
+
+```bash
+trtexec --onnx=resnet50_results/optimized_final.onnx \
+    --saveEngine=resnet50.engine \
+    --stronglyTyped
+```
+
+## Pattern Cache
+
+Reuse learned patterns on similar models (warm-start):
+
+```bash
+# First optimization on ResNet50
+python3 -m modelopt.onnx.quantization.autotune \
+    --onnx_path resnet50.bs128.onnx \
+    --output_dir ./resnet50_run
+
+# Download and prepare ResNet101 (or any similar model)
+curl -L -o resnet101_Opset17.onnx https://github.com/onnx/models/raw/main/Computer_Vision/resnet101_Opset17_torch_hub/resnet101_Opset17.onnx
+python3 set_batch_size.py resnet101_Opset17.onnx --batch-size 128 --output resnet101.bs128.onnx
+
+# Reuse patterns from ResNet50 on ResNet101
+python3 -m modelopt.onnx.quantization.autotune \
+    --onnx_path resnet101.bs128.onnx \
+    --output_dir ./resnet101_run \
+    --pattern_cache ./resnet50_run/autotuner_state_pattern_cache.yaml
+```
+
+## Optimize from Existing QDQ Model
+
+If the user already has a quantized model, they can use it as a starting point to potentially find even better Q/DQ placements:
+
+```bash
+# Use an existing QDQ model as baseline (imports quantization patterns)
+python3 -m modelopt.onnx.quantization.autotune \
+    --onnx_path resnet50.bs128.onnx \
+    --output_dir ./resnet50_improved \
+    --qdq_baseline resnet50_quantized.onnx \
+    --schemes_per_region 40
+```
+
+This will:
+
+1. Extract Q/DQ insertion points from the baseline model
+2. Import them into the pattern cache as seed schemes
+3. Generate and test variations to find better placements
+4. 
Compare against the baseline performance
+
+**Use cases:**
+
+- **Improve existing quantization**: Fine-tune manually quantized models
+- **Compare tools**: Test if autotuner can beat other quantization methods
+- **Bootstrap optimization**: Start from expert-tuned schemes
+
+**Example workflow:**
+
+```bash
+# Step 1: Create initial quantized model with modelopt
+# For example, using modelopt's quantize function:
+python3 -c "
+import numpy as np
+from modelopt.onnx.quantization import quantize
+
+# Create dummy calibration data (replace with real data for production)
+dummy_input = np.random.randn(128, 3, 224, 224).astype(np.float32)
+quantize(
+    'resnet50.bs128.onnx',
+    calibration_data=dummy_input,
+    calibration_method='entropy',
+    output_path='resnet50_quantized.onnx'
+)
+"
+
+# Step 2: Use the quantized baseline for autotuning
+# The autotuner will try to find better Q/DQ placements than the initial quantization
+python3 -m modelopt.onnx.quantization.autotune \
+    --onnx_path resnet50.bs128.onnx \
+    --output_dir ./resnet50_autotuned \
+    --qdq_baseline resnet50_quantized.onnx \
+    --schemes_per_region 50
+```
+
+**Note:** This example uses dummy calibration data. For production use, provide real calibration data representative of the inference workload.
+
+## Remote Autotuning with TensorRT
+
+TensorRT 10.16+ supports remote autotuning, which allows TensorRT's optimization process to be offloaded to remote hardware. This is useful when optimizing models for different target GPUs without having direct access to them. 
+ +To use remote autotuning during Q/DQ placement optimization, run with `trtexec` and pass extra args: + +```bash +python3 -m modelopt.onnx.quantization.autotune \ + --onnx_path resnet50.bs128.onnx \ + --output_dir ./resnet50_remote_autotuned \ + --schemes_per_region 50 \ + --use_trtexec \ + --trtexec_benchmark_args "--remoteAutoTuningConfig=\"\"" +``` + +**Requirements:** + +- TensorRT 10.16 or later +- Valid remote autotuning configuration +- `--use_trtexec` must be set (benchmarking uses `trtexec` instead of the TensorRT Python API) + +Replace `` with user's actual remote autotuning configuration string. Other TensorRT benchmark options (e.g. `--timing_cache`, `--warmup_runs`, `--timing_runs`, `--plugin_libraries`) are also available; run `--help` for details. + +## Programmatic API Usage + +All examples above use the command-line interface. For **low-level programmatic control** in Python code, use the Python API directly. This allows user to: + +- Integrate autotuning into custom pipelines +- Implement custom evaluation functions +- Control state management and checkpointing +- Build custom optimization workflows + +**See the API Reference documentation for low-level usage:** + +- [`docs/source/reference/2_qdq_placement.rst`](../../docs/source/reference/2_qdq_placement.rst) + +The API docs include detailed examples of: + +- Using the `QDQAutotuner` class and `region_pattern_autotuning_workflow` +- Customizing region discovery and scheme generation +- Managing optimization state and pattern cache programmatically +- Implementing custom performance evaluators (e.g. 
via `init_benchmark_instance` and `benchmark_onnx_model`) + +## Documentation + +For comprehensive documentation on QDQ placement optimization, see: + +- **User Guide**: [`docs/source/guides/9_qdq_placement.rst`](../../docs/source/guides/9_qdq_placement.rst) + - Detailed explanations of how the autotuner works + - Advanced usage patterns and best practices + - Configuration options and performance tuning + - Troubleshooting common issues + +- **API Reference**: [`docs/source/reference/2_qdq_placement.rst`](../../docs/source/reference/2_qdq_placement.rst) + - Complete API documentation for all classes and functions + - Low-level usage examples + - State management and pattern cache details + +For command-line help and all options (e.g. `--state_file`, `--node_filter_list`, `--default_dq_dtype`, `--verbose`): + +```bash +python3 -m modelopt.onnx.quantization.autotune --help +``` diff --git a/examples/qdq_placement/set_batch_size.py b/examples/onnx/autoqdq/set_batch_size.py similarity index 73% rename from examples/qdq_placement/set_batch_size.py rename to examples/onnx/autoqdq/set_batch_size.py index 205dbb551..014bf07e9 100644 --- a/examples/qdq_placement/set_batch_size.py +++ b/examples/onnx/autoqdq/set_batch_size.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,9 +25,25 @@ """ import argparse +import sys import onnx -from onnx import shape_inference + +from modelopt.onnx.utils import check_model, infer_shapes, save_onnx + + +def _validate_onnx_model_path(path: str) -> None: + """Ensure the model path has a .onnx extension for consistent output path generation.""" + if not path.lower().endswith(".onnx"): + print(f"Error: Model path must end with '.onnx', got: {path}", file=sys.stderr) + sys.exit(1) + + +def _validate_batch_size(batch_size: int) -> None: + """Ensure batch size is a positive integer to prevent invalid model configurations.""" + if batch_size < 1: + print(f"Error: Batch size must be a positive integer, got: {batch_size}", file=sys.stderr) + sys.exit(1) def set_batch_size(model_path: str, batch_size: int, output_path: str) -> None: @@ -68,20 +84,21 @@ def set_batch_size(model_path: str, batch_size: int, output_path: str) -> None: ) # Run shape inference to propagate the batch size through the model + # Use modelopt's infer_shapes to support models with external data and large models print("Running shape inference...") try: - model = shape_inference.infer_shapes(model) + model = infer_shapes(model) except Exception as e: print(f"Warning: Shape inference failed: {e}") print("Continuing without shape inference...") - # Save the modified model + # Save the modified model (handles external data and IR > max ORT supported) print(f"Saving modified model to {output_path}...") - onnx.save(model, output_path) + save_onnx(model, output_path) - # Verify the saved model + # Verify the saved model (handles external data and large models) print("Verifying model...") - onnx.checker.check_model(output_path) + check_model(model) print("✓ Model saved and verified successfully!") @@ -109,9 +126,13 @@ def main(): args = parser.parse_args() - # Generate output path if not provided + _validate_onnx_model_path(args.model) 
+ _validate_batch_size(args.batch_size) + + # Generate output path if not provided (requires .onnx extension, validated above) if args.output is None: - base_name = args.model.rsplit(".", 1)[0] + parts = args.model.rsplit(".", 1) + base_name = parts[0] if len(parts) == 2 else args.model args.output = f"{base_name}.bs{args.batch_size}.onnx" set_batch_size(args.model, args.batch_size, args.output) diff --git a/examples/qdq_placement/README.md b/examples/qdq_placement/README.md deleted file mode 100644 index beed4f007..000000000 --- a/examples/qdq_placement/README.md +++ /dev/null @@ -1,256 +0,0 @@ -# QDQ Placement Optimization Example - -This example demonstrates automated Q/DQ (Quantize/Dequantize) node placement optimization for ONNX models using TensorRT performance measurements. - -## Prerequisites - -### Get the Model - -Download the ResNet50 model from the ONNX Model Zoo: - -```bash -# Download ResNet50 from ONNX Model Zoo -curl -L -o resnet50_Opset17.onnx https://github.com/onnx/models/raw/main/Computer_Vision/resnet50_Opset17_torch_hub/resnet50_Opset17.onnx -``` - -### Set Fixed Batch Size (Recommended) - -The downloaded model has a dynamic batch size. For best performance with TensorRT benchmarking, set a fixed batch size: - -```bash -# Set batch size to 128 using the provided script -python3 set_batch_size.py resnet50_Opset17.onnx --batch-size 128 --output resnet50.bs128.onnx - -# Or for other batch sizes -python3 set_batch_size.py resnet50_Opset17.onnx --batch-size 1 --output resnet50.bs1.onnx -``` - -This creates `resnet50.bs128.onnx` with a fixed batch size of 128, which is optimal for TensorRT performance benchmarking. - -**Note:** The script requires the `onnx` package. If you have modelopt installed, this dependency should already be available. 
- -### What's in This Directory - -- `set_batch_size.py` - Script to convert dynamic batch size models to fixed batch size -- `README.md` - This guide - -**Note:** ONNX model files are not included in the repository (excluded via `.gitignore`). Download and prepare them using the instructions above. - -## Quick Start - -### Basic Usage - -Optimize the ResNet50 model with INT8 quantization: - -```bash -# Using the fixed batch size model (recommended) -python3 -m modelopt.onnx.quantization.autotune \ - --model resnet50.bs128.onnx \ - --output ./resnet50_results \ - --quant-type int8 \ - --schemes-per-region 30 - -# Or use the original dynamic batch size model -python3 -m modelopt.onnx.quantization.autotune \ - --model resnet50_Opset17.onnx \ - --output ./resnet50_results \ - --quant-type int8 \ - --schemes-per-region 30 -``` - -This will: - -1. Automatically discover optimization regions in your model -2. Test 30 different Q/DQ placement schemes per region pattern -3. Measure TensorRT performance for each scheme -4. 
Export the best optimized model to `./resnet50_results/optimized_final.onnx` - -### FP8 Quantization - -For FP8 quantization (faster on modern GPUs): - -```bash -python3 -m modelopt.onnx.quantization.autotune \ - --model resnet50.bs128.onnx \ - --output ./resnet50_fp8_results \ - --quant-type fp8 \ - --schemes-per-region 50 -``` - -### Faster Exploration - -For quick experiments, reduce the number of schemes: - -```bash -python3 -m modelopt.onnx.quantization.autotune \ - --model resnet50.bs128.onnx \ - --output ./resnet50_quick \ - --schemes-per-region 15 -``` - -## Output Structure - -After running, you'll get: - -```log -resnet50_results/ -├── optimized_final.onnx # Your optimized model -├── baseline.onnx # Baseline for comparison -├── autotuner_state.yaml # Resume checkpoint -├── autotuner_state_pattern_cache.yaml # Reusable patterns -└── logs/ - ├── baseline.log # TensorRT baseline log - ├── region_*_scheme_*.log # Per-scheme logs - └── final.log # Final model log -``` - -## Using the Optimized Model - -Deploy with TensorRT: - -```bash -trtexec --onnx=resnet50_results/optimized_final.onnx \ - --saveEngine=resnet50.engine \ - --stronglyTyped -``` - -## Pattern Cache - -Reuse learned patterns on similar models: - -```bash -# First optimization on ResNet50 -python3 -m modelopt.onnx.quantization.autotune \ - --model resnet50.bs128.onnx \ - --output ./resnet50_run - -# Download and prepare ResNet101 (or any similar model) -curl -L -o resnet101_Opset17.onnx https://github.com/onnx/models/raw/main/Computer_Vision/resnet101-v2-7.onnx -python3 set_batch_size.py resnet101_Opset17.onnx --batch-size 128 --output resnet101.bs128.onnx - -# Reuse patterns from ResNet50 on ResNet101 (much faster!) 
-python3 -m modelopt.onnx.quantization.autotune \ - --model resnet101.bs128.onnx \ - --output ./resnet101_run \ - --pattern-cache ./resnet50_run/autotuner_state_pattern_cache.yaml -``` - -## Optimize from Existing QDQ Model - -If you already have a quantized model (e.g., from manual quantization or another tool), you can use it as a starting point to potentially find even better Q/DQ placements: - -```bash -# Use an existing QDQ model as baseline -python3 -m modelopt.onnx.quantization.autotune \ - --model resnet50.bs128.onnx \ - --output ./resnet50_improved \ - --qdq-baseline resnet50_quantized.onnx \ - --schemes-per-region 40 -``` - -This will: - -1. Extract Q/DQ insertion points from the baseline model -2. Use them as seed schemes during optimization -3. Generate and test variations to find better placements -4. Compare against the baseline performance - -**Use cases:** - -- **Improve existing quantization**: Fine-tune manually quantized models -- **Compare tools**: Test if autotuner can beat other quantization methods -- **Bootstrap optimization**: Start from expert-tuned schemes - -**Example workflow:** - -```bash -# Step 1: Create initial quantized model with any quantization tool -# For example, using modelopt's quantize function: -python3 -c " -import numpy as np -from modelopt.onnx.quantization import quantize - -# Create dummy calibration data (replace with real data for production) -dummy_input = np.random.randn(128, 3, 224, 224).astype(np.float32) -quantize( - 'resnet50.bs128.onnx', - calibration_data=dummy_input, - calibration_method='entropy', - output_path='resnet50_quantized.onnx' -) -" - -# Step 2: Use the quantized baseline for autotuning -# The autotuner will try to find better Q/DQ placements than the initial quantization -python3 -m modelopt.onnx.quantization.autotune \ - --model resnet50.bs128.onnx \ - --output ./resnet50_autotuned \ - --qdq-baseline resnet50_quantized.onnx \ - --schemes-per-region 50 -``` - -**Note:** This example uses dummy 
calibration data. For production use, provide real calibration data representative of your inference workload. - -## Remote Autotuning with TensorRT - -TensorRT 10.16+ supports remote autotuning, which allows you to offload TensorRT's optimization process to remote hardware. This is useful when you want to optimize models for different target GPUs without having direct access to them. - -To use remote autotuning during Q/DQ placement optimization: - -```bash -python3 -m modelopt.onnx.quantization.autotune \ - --model resnet50.bs128.onnx \ - --output ./resnet50_remote_autotuned \ - --schemes-per-region 50 \ - --use_trtexec \ - --trtexec_benchmark_args "--remoteAutoTuningConfig=\"\"" -``` - -**Requirements:** - -- TensorRT 10.16 or later -- Valid remote autotuning configuration -- `--use_trtexec` flag must be enabled - -Replace `` with your actual remote autotuning configuration string provided by your TensorRT setup. - -## Programmatic API Usage - -All examples above use the command-line interface. For **low-level programmatic control** in your Python code, use the Python API directly. 
This allows you to: - -- Integrate autotuning into custom pipelines -- Implement custom evaluation functions -- Control state management and checkpointing -- Build custom optimization workflows - -**See the API Reference documentation for low-level usage:** - -- [`docs/source/reference/2_qdq_placement.rst`](../../docs/source/reference/2_qdq_placement.rst) - -The API docs include detailed examples of: - -- Using the `Autotuner` class directly -- Customizing region discovery and scheme generation -- Managing optimization state programmatically -- Implementing custom performance evaluators - -## Documentation - -For comprehensive documentation on QDQ placement optimization, see: - -- **User Guide**: [`docs/source/guides/9_qdq_placement.rst`](../../docs/source/guides/9_qdq_placement.rst) - - Detailed explanations of how the autotuner works - - Advanced usage patterns and best practices - - Configuration options and performance tuning - - Troubleshooting common issues - -- **API Reference**: [`docs/source/reference/2_qdq_placement.rst`](../../docs/source/reference/2_qdq_placement.rst) - - Complete API documentation for all classes and functions - - Low-level usage examples - - State management and pattern cache details - -For command-line help: - -```bash -python3 -m modelopt.onnx.quantization.autotune --help -``` From ee48afcd533ebab9a271e7e948a39addc27f8e49 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Wed, 4 Mar 2026 01:54:28 +0000 Subject: [PATCH 5/5] resolve comments Signed-off-by: Will Guo --- examples/onnx/autoqdq/README.md | 75 ++++++------- examples/onnx/autoqdq/set_batch_size.py | 142 ------------------------ 2 files changed, 34 insertions(+), 183 deletions(-) delete mode 100644 examples/onnx/autoqdq/set_batch_size.py diff --git a/examples/onnx/autoqdq/README.md b/examples/onnx/autoqdq/README.md index 01772a993..30932302d 100644 --- a/examples/onnx/autoqdq/README.md +++ b/examples/onnx/autoqdq/README.md @@ -4,22 +4,22 @@ This example demonstrates automated 
Q/DQ (Quantize/Dequantize) node placement op ## Table of Contents -- [Prerequisites](#prerequisites) - - [Get the Model](#get-the-model) - - [Set Fixed Batch Size](#set-fixed-batch-size) - - [What's in This Directory](#whats-in-this-directory) -- [Quick Start](#quick-start) - - [Basic Usage](#basic-usage) - - [FP8 Quantization](#fp8-quantization) - - [Faster Exploration](#faster-exploration) -- [Output Structure](#output-structure) -- [Region Inspection](#region-inspection) -- [Using the Optimized Model](#using-the-optimized-model) -- [Pattern Cache](#pattern-cache) -- [Optimize from Existing QDQ Model](#optimize-from-existing-qdq-model) -- [Remote Autotuning with TensorRT](#remote-autotuning-with-tensorrt) -- [Programmatic API Usage](#programmatic-api-usage) -- [Documentation](#documentation) +
+ +| **Section** | **Description** | **Link** | **Docs** | +| :------------: | :------------: | :------------: | :------------: | +| Prerequisites | Get the model, set fixed batch size, and directory overview | [Link](#prerequisites) | | +| Quick Start | Basic usage, FP8 quantization, and faster exploration | [Link](#quick-start) | | +| Output Structure | Output workspace layout and files | [Link](#output-structure) | | +| Region Inspection | Debug region discovery and partitioning | [Link](#region-inspection) | | +| Using the Optimized Model | Deploy with TensorRT | [Link](#using-the-optimized-model) | | +| Pattern Cache | Reuse learned patterns on similar models | [Link](#pattern-cache) | | +| Optimize from Existing QDQ Model | Start from an existing quantized model | [Link](#optimize-from-existing-qdq-model) | | +| Remote Autotuning with TensorRT | Offload autotuning to remote hardware | [Link](#remote-autotuning-with-tensorrt) | | +| Programmatic API Usage | Python API and low-level control | [Link](#programmatic-api-usage) | | +| Documentation | User guide and API reference | [Link](#documentation) | [docs](https://nvidia.github.io/Model-Optimizer/) | + +
## Prerequisites @@ -34,23 +34,16 @@ curl -L -o resnet50_Opset17.onnx https://github.com/onnx/models/raw/main/Compute ### Set Fixed Batch Size -The downloaded model has a dynamic batch size. For best performance with TensorRT benchmarking, set a fixed batch size: +The downloaded model has a dynamic batch size. For best performance with TensorRT benchmarking, set a fixed batch size using Polygraphy: ```bash -# Set batch size to 128 using the provided script -python3 set_batch_size.py resnet50_Opset17.onnx --batch-size 128 --output resnet50.bs128.onnx - -# Or for other batch sizes -python3 set_batch_size.py resnet50_Opset17.onnx --batch-size 1 --output resnet50.bs1.onnx +polygraphy surgeon sanitize --override-input-shapes x:[128,3,224,224] -o resnet50_Opset17_bs128.onnx resnet50_Opset17.onnx ``` -This creates `resnet50.bs128.onnx` with a fixed batch size of 128, which is optimal for TensorRT performance benchmarking. - -**Note:** The script requires the `onnx` package. +For other batch sizes, change the first dimension in the shape (e.g. `x:[1,3,224,224]` for batch size 1). ### What's in This Directory -- `set_batch_size.py` - Script to convert dynamic batch size models to fixed batch size - `README.md` - This guide **Note:** ONNX model files are not included in the repository (excluded via `.gitignore`). Download and prepare them using the instructions above. 
@@ -64,7 +57,7 @@ Optimize the ResNet50 model with INT8 quantization: ```bash # Using the fixed batch size model python3 -m modelopt.onnx.quantization.autotune \ - --onnx_path resnet50.bs128.onnx \ + --onnx_path resnet50_Opset17_bs128.onnx \ --output_dir ./resnet50_results \ --quant_type int8 \ --schemes_per_region 30 @@ -92,7 +85,7 @@ For FP8 quantization: ```bash python3 -m modelopt.onnx.quantization.autotune \ - --onnx_path resnet50.bs128.onnx \ + --onnx_path resnet50_Opset17_bs128.onnx \ --output_dir ./resnet50_fp8_results \ --quant_type fp8 \ --schemes_per_region 50 @@ -104,7 +97,7 @@ For quick experiments, reduce the number of schemes: ```bash python3 -m modelopt.onnx.quantization.autotune \ - --onnx_path resnet50.bs128.onnx \ + --onnx_path resnet50_Opset17_bs128.onnx \ --output_dir ./resnet50_quick \ --schemes_per_region 15 ``` @@ -133,16 +126,16 @@ To debug how the autotuner discovers and partitions regions in your model, use t ```bash # Basic inspection (regions with quantizable ops only) -python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50.bs128.onnx +python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50_Opset17_bs128.onnx # Verbose mode for detailed debug logging -python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50.bs128.onnx --verbose +python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50_Opset17_bs128.onnx --verbose # Custom maximum sequence region size -python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50.bs128.onnx --max-sequence-size 20 +python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50_Opset17_bs128.onnx --max-sequence-size 20 # Include all regions (including those without Conv/MatMul etc.) 
-python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50.bs128.onnx --include-all-regions +python3 -m modelopt.onnx.quantization.autotune.region_inspect --model resnet50_Opset17_bs128.onnx --include-all-regions ``` Short option: `-m` for `--model`, `-v` for `--verbose`. Use this to verify region boundaries and counts before or during autotuning. @@ -164,16 +157,16 @@ Reuse learned patterns on similar models (warm-start): ```bash # First optimization on ResNet50 python3 -m modelopt.onnx.quantization.autotune \ - --onnx_path resnet50.bs128.onnx \ + --onnx_path resnet50_Opset17_bs128.onnx \ --output_dir ./resnet50_run # Download and prepare ResNet101 (or any similar model) -curl -L -o resnet101_Opset17.onnx https://github.com/onnx/models/blob/main/Computer_Vision/resnet101_Opset17_torch_hub/resnet101_Opset17.onnx -python3 set_batch_size.py resnet101_Opset17.onnx --batch-size 128 --output resnet101.bs128.onnx +curl -L -o resnet101_Opset17.onnx https://github.com/onnx/models/raw/main/Computer_Vision/resnet101_Opset17_torch_hub/resnet101_Opset17.onnx +polygraphy surgeon sanitize --override-input-shapes x:[128,3,224,224] -o resnet101_Opset17_bs128.onnx resnet101_Opset17.onnx # Reuse patterns from ResNet50 on ResNet101 python3 -m modelopt.onnx.quantization.autotune \ - --onnx_path resnet101.bs128.onnx \ + --onnx_path resnet101_Opset17_bs128.onnx \ --output_dir ./resnet101_run \ --pattern_cache ./resnet50_run/autotuner_state_pattern_cache.yaml ``` @@ -185,7 +178,7 @@ If the user already have a quantized model, he can use it as a starting point to ```bash # Use an existing QDQ model as baseline (imports quantization patterns) python3 -m modelopt.onnx.quantization.autotune \ - --onnx_path resnet50.bs128.onnx \ + --onnx_path resnet50_Opset17_bs128.onnx \ --output_dir ./resnet50_improved \ --qdq_baseline resnet50_quantized.onnx \ --schemes_per_region 40 @@ -216,7 +209,7 @@ from modelopt.onnx.quantization import quantize # Create dummy calibration data 
(replace with real data for production) dummy_input = np.random.randn(128, 3, 224, 224).astype(np.float32) quantize( - 'resnet50.bs128.onnx', + 'resnet50_Opset17_bs128.onnx', calibration_data=dummy_input, calibration_method='entropy', output_path='resnet50_quantized.onnx' @@ -226,7 +219,7 @@ quantize( # Step 2: Use the quantized baseline for autotuning # The autotuner will try to find better Q/DQ placements than the initial quantization python3 -m modelopt.onnx.quantization.autotune \ - --onnx_path resnet50.bs128.onnx \ + --onnx_path resnet50_Opset17_bs128.onnx \ --output_dir ./resnet50_autotuned \ --qdq_baseline resnet50_quantized.onnx \ --schemes_per_region 50 @@ -242,7 +235,7 @@ To use remote autotuning during Q/DQ placement optimization, run with `trtexec` ```bash python3 -m modelopt.onnx.quantization.autotune \ - --onnx_path resnet50.bs128.onnx \ + --onnx_path resnet50_Opset17_bs128.onnx \ --output_dir ./resnet50_remote_autotuned \ --schemes_per_region 50 \ --use_trtexec \ diff --git a/examples/onnx/autoqdq/set_batch_size.py b/examples/onnx/autoqdq/set_batch_size.py deleted file mode 100644 index 014bf07e9..000000000 --- a/examples/onnx/autoqdq/set_batch_size.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Script to set a fixed batch size for ONNX models. 
- -This script modifies an ONNX model with dynamic batch size to use a fixed batch size, -which is often beneficial for TensorRT performance benchmarking. - -Usage: - python set_batch_size.py resnet50_Opset17.onnx --batch-size 128 --output resnet50.bs128.onnx -""" - -import argparse -import sys - -import onnx - -from modelopt.onnx.utils import check_model, infer_shapes, save_onnx - - -def _validate_onnx_model_path(path: str) -> None: - """Ensure the model path has a .onnx extension for consistent output path generation.""" - if not path.lower().endswith(".onnx"): - print(f"Error: Model path must end with '.onnx', got: {path}", file=sys.stderr) - sys.exit(1) - - -def _validate_batch_size(batch_size: int) -> None: - """Ensure batch size is a positive integer to prevent invalid model configurations.""" - if batch_size < 1: - print(f"Error: Batch size must be a positive integer, got: {batch_size}", file=sys.stderr) - sys.exit(1) - - -def set_batch_size(model_path: str, batch_size: int, output_path: str) -> None: - """ - Set a fixed batch size for an ONNX model. 
- - Args: - model_path: Path to input ONNX model - batch_size: Desired batch size - output_path: Path to save modified model - """ - # Load the model - print(f"Loading model from {model_path}...") - model = onnx.load(model_path) - - # Get the input tensor - graph = model.graph - input_tensor = graph.input[0] - - print( - f"Original input shape: {[d.dim_param or d.dim_value for d in input_tensor.type.tensor_type.shape.dim]}" - ) - - # Modify the batch dimension (first dimension) - if len(input_tensor.type.tensor_type.shape.dim) > 0: - input_tensor.type.tensor_type.shape.dim[0].dim_value = batch_size - # Clear any symbolic dimension parameter - input_tensor.type.tensor_type.shape.dim[0].ClearField("dim_param") - - # Also update output shapes if needed - for output_tensor in graph.output: - if len(output_tensor.type.tensor_type.shape.dim) > 0: - output_tensor.type.tensor_type.shape.dim[0].dim_value = batch_size - output_tensor.type.tensor_type.shape.dim[0].ClearField("dim_param") - - print( - f"Modified input shape: {[d.dim_param or d.dim_value for d in input_tensor.type.tensor_type.shape.dim]}" - ) - - # Run shape inference to propagate the batch size through the model - # Use modelopt's infer_shapes to support models with external data and large models - print("Running shape inference...") - try: - model = infer_shapes(model) - except Exception as e: - print(f"Warning: Shape inference failed: {e}") - print("Continuing without shape inference...") - - # Save the modified model (handles external data and IR > max ORT supported) - print(f"Saving modified model to {output_path}...") - save_onnx(model, output_path) - - # Verify the saved model (handles external data and large models) - print("Verifying model...") - check_model(model) - print("✓ Model saved and verified successfully!") - - -def main(): - parser = argparse.ArgumentParser( - description="Set a fixed batch size for an ONNX model", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" 
-Examples: - # Set batch size to 128 for ResNet50 - python set_batch_size.py resnet50_Opset17.onnx --batch-size 128 --output resnet50.bs128.onnx - - # Set batch size to 1 for single-image inference - python set_batch_size.py resnet50_Opset17.onnx --batch-size 1 --output resnet50.bs1.onnx - """, - ) - - parser.add_argument("model", help="Path to input ONNX model") - parser.add_argument( - "--batch-size", "-b", type=int, default=128, help="Batch size to set (default: 128)" - ) - parser.add_argument( - "--output", "-o", help="Path to save modified model (default: _bs.onnx)" - ) - - args = parser.parse_args() - - _validate_onnx_model_path(args.model) - _validate_batch_size(args.batch_size) - - # Generate output path if not provided (requires .onnx extension, validated above) - if args.output is None: - parts = args.model.rsplit(".", 1) - base_name = parts[0] if len(parts) == 2 else args.model - args.output = f"{base_name}.bs{args.batch_size}.onnx" - - set_batch_size(args.model, args.batch_size, args.output) - - -if __name__ == "__main__": - main()