From 3554ecfbb121dff9b79f2cc768913d9d3b94d12f Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Fri, 23 Jan 2026 20:04:15 -0500 Subject: [PATCH 01/42] Initial autotune codebase Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/cli.py | 294 +++++++++++++++++++++ 1 file changed, 294 insertions(+) create mode 100644 modelopt/onnx/quantization/autotune/cli.py diff --git a/modelopt/onnx/quantization/autotune/cli.py b/modelopt/onnx/quantization/autotune/cli.py new file mode 100644 index 000000000..a5809f9a5 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/cli.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""CLI argument parsing and execution for ONNX Q/DQ autotuning. + +This module provides `run_autotune` which handles both argument parsing and +workflow execution. See `__main__.py` for usage examples. 
+""" + +import argparse +import sys +from pathlib import Path + +from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) + +DEFAULT_OUTPUT_DIR = "./autotuner_output" +DEFAULT_NUM_SCHEMES = 30 +DEFAULT_QUANT_TYPE = "int8" +DEFAULT_DQ_DTYPE = "float32" +DEFAULT_TIMING_CACHE = "/tmp/trtexec_timing.cache" # nosec B108 +DEFAULT_WARMUP_RUNS = 5 +DEFAULT_TIMING_RUNS = 20 + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. + + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def log_benchmark_config(args): + """Log TensorRT benchmark configuration for transparency. + + Logs timing cache path, warmup/timing run counts, and any custom + plugin libraries that will be loaded. + + Args: + args: Parsed command-line arguments with benchmark configuration + """ + logger.info("Initializing TensorRT benchmark") + logger.info(f" Timing cache: {args.timing_cache}") + logger.info(f" Warmup runs: {args.warmup_runs}") + logger.info(f" Timing runs: {args.timing_runs}") + if args.plugin_libraries: + logger.info(f" Plugin libraries: {', '.join(args.plugin_libraries)}") + + +def run_autotune(args=None) -> int: + """Execute the complete pattern-based Q/DQ autotuning workflow. + + This function orchestrates the entire optimization process: + 1. Parses command-line arguments (if not provided) + 2. Validates input paths (model, baseline, output directory) + 3. Initializes TensorRT benchmark instance + 4. 
Runs pattern-based region autotuning workflow + 5. Handles interruptions gracefully with state preservation + + Args: + args: Optional parsed command-line arguments. If None, parses sys.argv. + + Returns: + Exit code: + - 0: Success + - 1: Autotuning failed (exception occurred) + - 130: Interrupted by user (Ctrl+C) + """ + if args is None: + args = _get_autotune_parser().parse_args() + + model_path = validate_file_path(args.onnx_path, "Model file") + validate_file_path(args.qdq_baseline, "QDQ baseline model") + output_dir = Path(args.output) + + log_benchmark_config(args) + init_benchmark_instance( + use_trtexec=args.use_trtexec, + plugin_libraries=args.plugin_libraries, + timing_cache_file=args.timing_cache, + warmup_runs=args.warmup_runs, + timing_runs=args.timing_runs, + ) + + logger.info("Autotuning Mode: Pattern-Based") + + try: + node_filter_list = None + if args.node_filter_list: + filter_file = validate_file_path(args.node_filter_list, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() + for line in f + if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + + region_pattern_autotuning_workflow( + model_path=str(model_path), + output_dir=output_dir, + num_schemes_per_region=args.num_schemes, + pattern_cache_file=args.pattern_cache_file, + state_file=args.state_file, + quant_type=args.quant_type, + default_dq_dtype=args.default_dq_dtype, + qdq_baseline_model=args.qdq_baseline, + node_filter_list=node_filter_list, + ) + + logger.info("\n" + "=" * 70) + logger.info("✓ Autotuning completed successfully!") + logger.info(f"✓ Results: {output_dir}") + logger.info("=" * 70) + return 0 + + except KeyboardInterrupt: + logger.warning("\nInterrupted by user") + state_file = args.state_file or output_dir / "autotuner_state.yaml" + logger.info(f"Progress saved to: {state_file}") + return 130 + + except Exception as e: + 
logger.error(f"\nAutotuning failed: {e}", exc_info=args.verbose) + return 1 + + +def _get_autotune_parser() -> argparse.ArgumentParser: + """Create and configure the command-line argument parser.""" + parser = argparse.ArgumentParser( + prog="modelopt.onnx.quantization.autotune", + description="ONNX Q/DQ Autotuning with TensorRT", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx + + # Import patterns from QDQ baseline model + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --qdq_baseline baseline.onnx + + # Use pattern cache for warm-start + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --pattern_cache cache.yaml + + # Full example with all options + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --schemes_per_region 50 \\ + --pattern_cache cache.yaml --qdq_baseline baseline.onnx \\ + --quant_type int8 --verbose + """, + ) + + # Model and Output + io_group = parser.add_argument_group("Model and Output") + io_group.add_argument( + "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file" + ) + io_group.add_argument( + "--output", + "-o", + type=str, + default=DEFAULT_OUTPUT_DIR, + help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})", + ) + + # Autotuning Strategy + strategy_group = parser.add_argument_group("Autotuning Strategy") + strategy_group.add_argument( + "--schemes_per_region", + "-s", + type=int, + default=DEFAULT_NUM_SCHEMES, + dest="num_schemes", + help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})", + ) + strategy_group.add_argument( + "--pattern_cache", + type=str, + default=None, + dest="pattern_cache_file", + help="Path to pattern cache YAML for warm-start (optional)", + ) + strategy_group.add_argument( + "--qdq_baseline", + type=str, + default=None, + help="Path to QDQ baseline ONNX model to import 
quantization patterns (optional)", + ) + strategy_group.add_argument( + "--state_file", + type=str, + default=None, + help="State file path for resume capability (default: /autotuner_state.yaml)", + ) + strategy_group.add_argument( + "--node_filter_list", + type=str, + default=None, + help="Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " + "Regions without any matching nodes are skipped during autotuning.", + ) + + # Quantization + quant_group = parser.add_argument_group("Quantization") + quant_group.add_argument( + "--quant_type", + type=str, + default=DEFAULT_QUANT_TYPE, + choices=["int8", "fp8"], + help=f"Quantization data type (default: {DEFAULT_QUANT_TYPE})", + ) + quant_group.add_argument( + "--default_dq_dtype", + type=str, + default=DEFAULT_DQ_DTYPE, + choices=["float16", "float32", "bfloat16"], + help="Default DQ output dtype if cannot be deduced (optional)", + ) + + # TensorRT Benchmark + trt_group = parser.add_argument_group("TensorRT Benchmark") + trt_group.add_argument( + "--use_trtexec", + action="store_true", + help="Use trtexec for benchmarking (default: False)", + default=False, + ) + trt_group.add_argument( + "--timing_cache", + type=str, + default=DEFAULT_TIMING_CACHE, + help=f"TensorRT timing cache file (default: {DEFAULT_TIMING_CACHE})", + ) + trt_group.add_argument( + "--warmup_runs", + type=int, + default=DEFAULT_WARMUP_RUNS, + help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})", + ) + trt_group.add_argument( + "--timing_runs", + type=int, + default=DEFAULT_TIMING_RUNS, + help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})", + ) + trt_group.add_argument( + "--plugin_libraries", + "--plugins", + type=str, + nargs="+", + default=None, + dest="plugin_libraries", + help="TensorRT plugin libraries (.so files) to load (optional, space-separated)", + ) + + # Logging + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose DEBUG logging") + + return parser From 
56a67e3175778e8a49db26ddd33e4161dc5efe05 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 26 Jan 2026 10:21:21 -0500 Subject: [PATCH 02/42] Add more tests Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/test_config.py | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 tests/unit/onnx/quantization/autotune/test_config.py diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py new file mode 100644 index 000000000..db6b02aa3 --- /dev/null +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for the Config class in the autotuner. + +Tests configuration parameter validation and defaults. 
+""" + +import os +import sys +import unittest + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from modelopt.onnx.quantization.autotune.common import Config + + +class TestConfig(unittest.TestCase): + """Test Config class functionality.""" + + def test_default_values(self): + """Test that Config has correct default values.""" + config = Config() + + # Logging + assert not config.verbose + + # Performance thresholds + + # Q/DQ defaults + assert config.default_q_scale == 0.1 + assert config.default_q_zero_point == 0 + assert config.default_quant_type == "int8" + + # Region builder settings + assert config.maximum_sequence_region_size == 10 + assert config.minimum_topdown_search_size == 10 + + # Scheme generation parameters + assert config.top_percent_to_mutate == 0.1 + assert config.minimum_schemes_to_mutate == 10 + assert config.maximum_mutations == 3 + assert config.maximum_generation_attempts == 100 + + # Pattern cache parameters + assert config.pattern_cache_minimum_distance == 4 + assert config.pattern_cache_max_entries_per_pattern == 32 + + print("✓ Config default values are correct") + + def test_custom_values(self): + """Test creating Config with custom values.""" + config = Config( + verbose=True, + default_q_scale=0.05, + default_q_zero_point=128, + default_quant_type="fp8", + maximum_sequence_region_size=20, + ) + + assert config.verbose + assert config.default_q_scale == 0.05 + assert config.default_q_zero_point == 128 + assert config.default_quant_type == "fp8" + assert config.maximum_sequence_region_size == 20 + print("✓ Config custom values work correctly") + + def test_region_size_validation(self): + """Test that region size parameters are positive.""" + config = Config(maximum_sequence_region_size=50, minimum_topdown_search_size=5) + assert config.maximum_sequence_region_size > 0 + assert config.minimum_topdown_search_size > 0 + print("✓ Config region size validation") + + def 
test_genetic_algorithm_params(self): + """Test genetic algorithm parameters.""" + config = Config( + top_percent_to_mutate=0.2, + minimum_schemes_to_mutate=2, + maximum_mutations=5, + maximum_generation_attempts=50, + ) + + assert config.top_percent_to_mutate == 0.2 + assert config.minimum_schemes_to_mutate == 2 + assert config.maximum_mutations == 5 + assert config.maximum_generation_attempts == 50 + print("✓ Config genetic algorithm parameters") + + def test_pattern_cache_params(self): + """Test pattern cache parameters.""" + config = Config(pattern_cache_minimum_distance=3, pattern_cache_max_entries_per_pattern=10) + + assert config.pattern_cache_minimum_distance == 3 + assert config.pattern_cache_max_entries_per_pattern == 10 + print("✓ Config pattern cache parameters") + + +def run_tests(): + """Run all Config tests.""" + print("=" * 70) + print("Config Class Test Suite") + print("=" * 70) + + loader = unittest.TestLoader() + suite = unittest.TestSuite() + suite.addTests(loader.loadTestsFromTestCase(TestConfig)) + + runner = unittest.TextTestRunner(verbosity=2) + result = runner.run(suite) + + print("\n" + "=" * 70) + print("Test Summary") + print("=" * 70) + print(f"Tests run: {result.testsRun}") + print(f"Successes: {result.testsRun - len(result.failures) - len(result.errors)}") + print(f"Failures: {len(result.failures)}") + print(f"Errors: {len(result.errors)}") + + if result.wasSuccessful(): + print("\n✓ All Config tests passed!") + return 0 + else: + print("\n✗ Some tests failed") + return 1 + + +if __name__ == "__main__": + sys.exit(run_tests()) From 6d58b4a5482a590d1f079ee2e112b3208765f57c Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 26 Jan 2026 11:42:52 -0500 Subject: [PATCH 03/42] Refactor: PR #702 Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git 
a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index d3b3de272..0c56a608a 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -531,12 +531,19 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: else: # Existing scheme is better, skip new one too_similar = True + if scheme.latency_ms < existing_scheme.latency_ms: + # New scheme is better, mark existing for replacement + schemes_to_replace.append(existing_scheme) break if existing_to_remove is not None: filtered_schemes.remove(existing_to_remove) if not too_similar: filtered_schemes.append(scheme) + elif schemes_to_replace: + for scheme_to_replace in schemes_to_replace: + filtered_schemes.remove(scheme_to_replace) + filtered_schemes.append(scheme) sorted_schemes = filtered_schemes From 710319ad58e15176bfb84641f54ecc6676fecaf5 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 26 Jan 2026 11:55:15 -0500 Subject: [PATCH 04/42] Remove python path in tests Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- tests/unit/onnx/quantization/autotune/test_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py index db6b02aa3..c5b20a8a9 100644 --- a/tests/unit/onnx/quantization/autotune/test_config.py +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # From 98a60b55c87dd91678189ea23325397cb0ba7dfb Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 27 Jan 2026 12:00:16 -0500 Subject: [PATCH 05/42] Recover docstrings and simplify code (->, , ) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/common.py | 1 + tests/unit/onnx/quantization/autotune/test_region.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index 0c56a608a..fe22e19c0 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -524,6 +524,7 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: distance = scheme.distance(existing_scheme) if distance < self.minimum_distance: # Schemes are too similar, keep the better one + too_similar = True if scheme.latency_ms < existing_scheme.latency_ms: # New scheme is better; mark existing for removal existing_to_remove = existing_scheme diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index 5a733017d..3bbf34ac9 100644 --- a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -59,7 +59,6 @@ def test_parent_child_relationship(parent_with_children): assert parent.get_children() == [child1, child2] assert child1.parent == child2.parent == parent - def test_add_and_get_nodes(leaf): leaf.nodes.update([0, 1, 2]) assert set(leaf.get_nodes()) == {0, 1, 2} @@ -79,7 +78,6 @@ def test_region_size_recursive(parent_with_children): parent.nodes.add(5) assert len(parent.get_region_nodes_and_descendants()) == 6 - def test_metadata(leaf): leaf.metadata.update({"pattern": "Conv->Relu", "quantizable": "true"}) assert leaf.metadata == {"pattern": "Conv->Relu", 
"quantizable": "true"} @@ -109,4 +107,4 @@ def test_remove_child(): parent.add_child(child) parent.remove_child(child) assert parent.get_children() == [] - assert child.parent is None + assert child.parent is None \ No newline at end of file From 91cef9c02288e27b5756621d608aaf012cd88e1a Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:03:02 -0500 Subject: [PATCH 06/42] Added unittest for workflows.py (failing) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/autotune/test_workflows.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 tests/unit/onnx/quantization/autotune/test_workflows.py diff --git a/tests/unit/onnx/quantization/autotune/test_workflows.py b/tests/unit/onnx/quantization/autotune/test_workflows.py new file mode 100644 index 000000000..40a323dce --- /dev/null +++ b/tests/unit/onnx/quantization/autotune/test_workflows.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import tempfile +import unittest +from pathlib import Path + +import onnx + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from unit.onnx.quantization.autotune.test_autotuner import create_simple_conv_model + +from modelopt.onnx.quantization.autotune.workflows import region_pattern_autotuning_workflow + + +class TestWorkflows(unittest.TestCase): + """Test workflows functionality.""" + + def test_export_quantized_model(self): + """Test exporting quantized model with Q/DQ.""" + model = create_simple_conv_model() + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + baseline_model_path = f.name + + # Save baseline model + onnx.save(model, baseline_model_path) + + output_dir = baseline_model_path.strip(".onnx") + output_path = output_dir + ".quant.onnx" + + try: + autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) + + # Export model with Q/DQ insertion + autotuner.export_onnx(output_path, insert_qdq=True) + + # Verify file was created + assert os.path.exists(output_path) + + # Verify it's a valid ONNX model + exported_model = onnx.load(output_path) + assert exported_model is not None + + # Verify that it contains Q/DQ nodes + qdq_nodes = [ + n + for n in exported_model.graph.node + if n.op_type in ["QuantizeLinear", "DequantizeLinear"] + ] + assert qdq_nodes, "Q/DQ nodes not found in quantized model" + + print("✓ QDQAutotuner export quantized model") + finally: + if os.path.exists(output_path): + os.unlink(output_path) From 7937cc25fd806277905d856b9e80082a96e1754b Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:06:37 -0500 Subject: [PATCH 07/42] Fix: 'Autotuning failed: 'PatternSchemes' object has no attribute 'node_inputs'' Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/region_pattern.py | 5 ++++- 1 
file changed, 4 insertions(+), 1 deletion(-) diff --git a/modelopt/onnx/quantization/autotune/region_pattern.py b/modelopt/onnx/quantization/autotune/region_pattern.py index a32273f84..9f80bd56e 100644 --- a/modelopt/onnx/quantization/autotune/region_pattern.py +++ b/modelopt/onnx/quantization/autotune/region_pattern.py @@ -21,7 +21,7 @@ import onnx_graphsurgeon as gs from modelopt.onnx.op_types import get_symmetric_ops -from modelopt.onnx.quantization.autotune.common import InsertionScheme, Region +from modelopt.onnx.quantization.autotune.common import InsertionScheme, PatternSchemes, Region from modelopt.onnx.quantization.autotune.insertion_points import ( ChildRegionInputInsertionPoint, ChildRegionOutputInsertionPoint, @@ -161,6 +161,9 @@ def matches( is provided but other is not a Region TypeError: If other is neither RegionPattern nor Region """ + if isinstance(scheme, PatternSchemes): + return set() + if isinstance(other, RegionPattern): if scheme is not None: raise ValueError("scheme parameter can only be used when matching against a Region") From 7c4e14b3912b90f25b2fb7d7bbfc44c78c14d9eb Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:38:48 -0500 Subject: [PATCH 08/42] Updated workflow test to test TRT and PythonTRT benchmarking Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/autotune/test_workflows.py | 74 ++++++++++--------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/tests/unit/onnx/quantization/autotune/test_workflows.py b/tests/unit/onnx/quantization/autotune/test_workflows.py index 40a323dce..c81794e5b 100644 --- a/tests/unit/onnx/quantization/autotune/test_workflows.py +++ b/tests/unit/onnx/quantization/autotune/test_workflows.py @@ -16,57 +16,65 @@ import os import sys import tempfile -import unittest from pathlib import Path import onnx +import pytest # Add parent directory to path sys.path.insert(0, 
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from _test_utils.import_helper import skip_if_no_tensorrt, skip_if_no_trtexec from unit.onnx.quantization.autotune.test_autotuner import create_simple_conv_model -from modelopt.onnx.quantization.autotune.workflows import region_pattern_autotuning_workflow +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) -class TestWorkflows(unittest.TestCase): - """Test workflows functionality.""" +@pytest.mark.parametrize("use_trtexec", [True, False]) +def test_export_quantized_model(use_trtexec): + """Test exporting quantized model with Q/DQ.""" + if use_trtexec: + skip_if_no_trtexec() + else: + skip_if_no_tensorrt() - def test_export_quantized_model(self): - """Test exporting quantized model with Q/DQ.""" - model = create_simple_conv_model() + model = create_simple_conv_model() - with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: - baseline_model_path = f.name + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + baseline_model_path = f.name - # Save baseline model - onnx.save(model, baseline_model_path) + # Save baseline model + onnx.save(model, baseline_model_path) - output_dir = baseline_model_path.strip(".onnx") - output_path = output_dir + ".quant.onnx" + output_dir = baseline_model_path.strip(".onnx") + output_path = output_dir + ".quant.onnx" - try: - autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) + try: + init_benchmark_instance(use_trtexec=False) + autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) - # Export model with Q/DQ insertion - autotuner.export_onnx(output_path, insert_qdq=True) + # Export model with Q/DQ insertion + autotuner.export_onnx(output_path, insert_qdq=True) - # Verify file was created - assert os.path.exists(output_path) + # Verify file was created + assert os.path.exists(output_path) - # Verify it's a 
valid ONNX model - exported_model = onnx.load(output_path) - assert exported_model is not None + # Verify it's a valid ONNX model + exported_model = onnx.load(output_path) + assert exported_model is not None - # Verify that it contains Q/DQ nodes - qdq_nodes = [ - n - for n in exported_model.graph.node - if n.op_type in ["QuantizeLinear", "DequantizeLinear"] - ] - assert qdq_nodes, "Q/DQ nodes not found in quantized model" + # Verify that it contains Q/DQ nodes + qdq_nodes = [ + n + for n in exported_model.graph.node + if n.op_type in ["QuantizeLinear", "DequantizeLinear"] + ] + assert qdq_nodes, "Q/DQ nodes not found in quantized model" - print("✓ QDQAutotuner export quantized model") - finally: - if os.path.exists(output_path): - os.unlink(output_path) + print("✓ QDQAutotuner export quantized model") + finally: + if os.path.exists(output_path): + os.unlink(output_path) From 64836edf261d5da68152aacd1397dad33854bdfc Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:30:34 -0500 Subject: [PATCH 09/42] Fix test: use_trtexec flag Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- tests/unit/onnx/quantization/autotune/test_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/onnx/quantization/autotune/test_workflows.py b/tests/unit/onnx/quantization/autotune/test_workflows.py index c81794e5b..c8edafc06 100644 --- a/tests/unit/onnx/quantization/autotune/test_workflows.py +++ b/tests/unit/onnx/quantization/autotune/test_workflows.py @@ -53,7 +53,7 @@ def test_export_quantized_model(use_trtexec): output_path = output_dir + ".quant.onnx" try: - init_benchmark_instance(use_trtexec=False) + init_benchmark_instance(use_trtexec=use_trtexec) autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) # Export model with Q/DQ insertion From a9af36afbf2e4e542e7a02a376ee7beadcd6e806 Mon Sep 17 00:00:00 2001 From: gcunhase 
<4861122+gcunhase@users.noreply.github.com> Date: Thu, 19 Feb 2026 12:26:49 -0500 Subject: [PATCH 10/42] Add real scales to Q/DQ nodes Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/__main__.py | 8 ++ .../onnx/quantization/autotune/workflows.py | 26 ++++- modelopt/onnx/quantization/fp8.py | 24 +++-- modelopt/onnx/quantization/int8.py | 37 ++++--- modelopt/onnx/quantization/ort_utils.py | 56 +++++----- modelopt/onnx/quantization/quantize.py | 102 +++++++++++++++--- modelopt/onnx/utils.py | 18 ++++ 7 files changed, 201 insertions(+), 70 deletions(-) diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index 6c79d9317..433980249 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -295,6 +295,13 @@ def get_parser() -> argparse.ArgumentParser: "if certain operations require a higher version." ), ) + argparser.add_argument( + "--autotune", + action="store_true", + help=( + "If set, detect optimal Q/DQ node placements according to the TensorRT version and platform available." 
+ ), + ) return argparser @@ -362,6 +369,7 @@ def main(): calibrate_per_node=args.calibrate_per_node, direct_io_types=args.direct_io_types, opset=args.opset, + autotune=args.autotune, ) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index 025d9fac4..57ad73015 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -20,6 +20,8 @@ """ import fnmatch +import shutil +import tempfile from pathlib import Path import onnx @@ -158,8 +160,8 @@ def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool: def region_pattern_autotuning_workflow( - model_path: str, - output_dir: Path, + model_path: str | onnx.ModelProto, + output_dir: Path | None = None, num_schemes_per_region: int = 30, pattern_cache_file: str | None = None, state_file: str | None = None, @@ -168,6 +170,7 @@ def region_pattern_autotuning_workflow( qdq_baseline_model: str | None = None, node_filter_list: list[str] | None = None, verbose: bool = False, + keep_output_dir: bool = False, ) -> QDQAutotuner: """Run automated Q/DQ (Quantization/Dequantization) optimization on an ONNX model. @@ -196,7 +199,7 @@ def region_pattern_autotuning_workflow( Args: model_path: Path to ONNX model file to optimize - output_dir: Directory for output files (state, logs, models). Created if doesn't exist. + output_dir: Directory for output files (state, logs, models). Created if it doesn't exist. num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern. Higher values explore more configurations but take longer (default: 30) pattern_cache_file: Optional path to pattern cache YAML file containing known-good schemes @@ -211,10 +214,14 @@ def region_pattern_autotuning_workflow( node_filter_list: Optional list of wildcard patterns to filter ONNX nodes. 
Regions without any matching nodes are skipped during autotuning (default: None) verbose: Enable verbose logging in Config for detailed autotuner output (default: False) + keep_output_dir: If True, keep output_dir; otherwise, remove it at the end of this function. Returns: QDQAutotuner instance after autotuning """ + if not output_dir: + output_dir = Path(tempfile.mkdtemp()) + output_dir.mkdir(parents=True, exist_ok=True) logs_dir = output_dir / "logs" logs_dir.mkdir(exist_ok=True) @@ -225,8 +232,11 @@ def region_pattern_autotuning_workflow( state_file = str(output_dir / "autotuner_state.yaml") state_path = Path(state_file) - logger.info(f"Loading model: {model_path}") - model = onnx.load(model_path) + if isinstance(model_path, str): + logger.info(f"Loading model: {model_path}") + model = onnx.load(model_path) + else: + model = model_path pattern_cache = None if pattern_cache_file: @@ -373,4 +383,10 @@ def region_pattern_autotuning_workflow( logger.debug(f" Logs: {logs_dir}") logger.debug(f" Region models: {models_dir}") + if not keep_output_dir: + logger.debug( + f"Removing output dir: {output_dir}. Set 'keep_output_dir=True' if you wish to keep it." + ) + shutil.rmtree(output_dir) + return autotuner diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py index 76a3e8167..d8e0349ca 100755 --- a/modelopt/onnx/quantization/fp8.py +++ b/modelopt/onnx/quantization/fp8.py @@ -183,6 +183,8 @@ def quantize( custom_ops_to_quantize: list[str] = [], direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, + no_quantize_inputs: list[tuple[gs.Node, gs.Node, str]] | None = None, **kwargs, ) -> onnx.ModelProto: """Applies FP8 GEMM only quantization to an ONNX file. 
@@ -215,7 +217,7 @@ def quantize( op_types_to_quantize.extend(list(custom_ops_to_quantize)) enable_gemv_detection_for_trt = kwargs.get("enable_gemv_detection_for_trt", True) - if enable_gemv_detection_for_trt: + if enable_gemv_detection_for_trt and not autotune: # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. @@ -233,7 +235,8 @@ def quantize( # Collect node names to exclude from quantization nodes_to_exclude = find_nodes_to_exclude(graph, nodes_to_exclude, op_types_to_exclude) # type: ignore[arg-type] - nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")) + if not autotune: + nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")) # Change the default configuration of ORT quantization op_types = {node.op for node in graph.nodes} @@ -244,19 +247,22 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, + autotune, ) logger.info( f"Quantizable op types in the model: {[t for t in op_types_to_quantize if t in op_types]}" ) # Collect node names to include in quantization - no_quantize_inputs = [] - nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) - if not nodes_to_quantize: - quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( - graph, quantizable_op_types, nodes_to_exclude - ) - nodes_to_quantize = [node.name for node in quantizable_nodes] + nodes_to_quantize = nodes_to_quantize or [] + no_quantize_inputs = no_quantize_inputs or [] + if not autotune: + nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) + if not nodes_to_quantize: + quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( + graph, quantizable_op_types, nodes_to_exclude + ) + nodes_to_quantize = [node.name for node in quantizable_nodes] # Update the list of nodes to quantize nodes_to_quantize = [ diff 
--git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py index 6e350a16f..b1f781fb5 100755 --- a/modelopt/onnx/quantization/int8.py +++ b/modelopt/onnx/quantization/int8.py @@ -133,6 +133,8 @@ def quantize( custom_ops_to_quantize: list[str] = [], direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, + no_quantize_inputs: list[tuple[Node, Node, str]] | None = None, **kwargs, ) -> onnx.ModelProto: """Applies INT8 quantization to an ONNX file using the compiler friendly heuristics. @@ -157,7 +159,7 @@ def quantize( return onnx_model enable_gemv_detection_for_trt = kwargs.get("enable_gemv_detection_for_trt", True) - if enable_gemv_detection_for_trt: + if enable_gemv_detection_for_trt and not autotune: # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. @@ -175,7 +177,8 @@ def quantize( # Collect node names to exclude from quantization nodes_to_exclude = find_nodes_to_exclude(graph, nodes_to_exclude, op_types_to_exclude) # type: ignore[arg-type] - nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")) + if not autotune: + nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")) # Change the default configuration of ORT quantization op_types_to_quantize = op_types_to_quantize or [] @@ -189,22 +192,27 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, + autotune, ) logger.info(f"Quantizable op types: {[t for t in quantizable_op_types if t in op_types]}") # Collect node names to include in quantization - no_quantize_inputs = [] - nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) - if not nodes_to_quantize: - # If nodes_to_quantize is not passed, use user supplied op_types_to_quantize list - nodes_to_quantize = [node.name for node in graph.nodes if node.op in 
op_types_to_quantize] - - # If op_types_to_quantize is not provided, use default QDQ placement algorithm + nodes_to_quantize = nodes_to_quantize or [] + no_quantize_inputs = no_quantize_inputs or [] + if not autotune: + nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) if not nodes_to_quantize: - quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( - graph, quantizable_op_types, nodes_to_exclude - ) - nodes_to_quantize = [node.name for node in quantizable_nodes] + # If nodes_to_quantize is not passed, use user supplied op_types_to_quantize list + nodes_to_quantize = [ + node.name for node in graph.nodes if node.op in op_types_to_quantize + ] + + # If op_types_to_quantize is not provided, use default QDQ placement algorithm + if not nodes_to_quantize: + quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( + graph, quantizable_op_types, nodes_to_exclude + ) + nodes_to_quantize = [node.name for node in quantizable_nodes] # Read the calibration cache and quantize nodes for which activation scale values are cached if calibration_cache_path: @@ -220,7 +228,8 @@ def quantize( logger.info( f"Skipping quantization of nodes: {set(nodes_to_quantize) - set(iq_quantized_nodes)}" ) - nodes_to_quantize = list(set(nodes_to_quantize).intersection(iq_quantized_nodes)) + if not autotune: + nodes_to_quantize = list(set(nodes_to_quantize).intersection(iq_quantized_nodes)) # Update the list of nodes to quantize nodes_to_quantize = [ diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 5c89e20d7..089c8850f 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -271,6 +271,7 @@ def configure_ort( calibration_eps: list[str] | None = None, calibrate_per_node: bool = False, custom_ops_to_quantize: list[str] = [], + autotune: bool = False, ): """Configure and patches ORT to support ModelOpt ONNX quantization.""" logger.info("Configuring ORT for 
ModelOpt ONNX quantization") @@ -289,33 +290,34 @@ def configure_ort( # Patch ORT modules to fix bugs and support some edge cases patch_ort_modules(calibrate_per_node) - # Remove copy, reduction and activation ops from ORT QDQ registry - logger.debug("Removing non-quantizable ops from QDQ registry") - for op_type in [ - "ArgMax", - "Concat", - "EmbedLayerNormalization", - "Gather", - "GatherElements", - "GatherND", - "InstanceNormalization", - "LeakyRelu", - "Pad", - "Relu", - "Reshape", - "Slice", - "Sigmoid", - "Softmax", - "Split", - "Squeeze", - "Transpose", - "Unsqueeze", - "Where", - ]: - if op_type in QLinearOpsRegistry: - del QLinearOpsRegistry[op_type] - if op_type in QDQRegistry: - del QDQRegistry[op_type] + if not autotune: + # Remove copy, reduction and activation ops from ORT QDQ registry + logger.debug("Removing non-quantizable ops from QDQ registry") + for op_type in [ + "ArgMax", + "Concat", + "EmbedLayerNormalization", + "Gather", + "GatherElements", + "GatherND", + "InstanceNormalization", + "LeakyRelu", + "Pad", + "Relu", + "Reshape", + "Slice", + "Sigmoid", + "Softmax", + "Split", + "Squeeze", + "Transpose", + "Unsqueeze", + "Where", + ]: + if op_type in QLinearOpsRegistry: + del QLinearOpsRegistry[op_type] + if op_type in QDQRegistry: + del QDQRegistry[op_type] # Prepare TensorRT friendly quantization settings no_output_quantization_op_types = [ diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index da7ff126d..a4f631f9f 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -73,6 +73,7 @@ QDQ_PRECISION_MIN_OPSET, duplicate_shared_constants, get_opset_version, + get_quantized_nodes, name_onnx_nodes, save_onnx, ) @@ -242,6 +243,52 @@ def _preprocess_onnx( ) +def _find_nodes_to_quantize_autotune( + onnx_path: str, + onnx_model: onnx.ModelProto, + quantize_mode: str, + trt_plugins: list[str], + high_precision_dtype: str = "fp16", +) -> tuple[list[str], 
list[str], list[tuple[gs.Node, gs.Node, str]]]: + logger.info("Running Auto Q/DQ with TensorRT") + from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops + from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, + ) + + # Initialize Autotuner with the Python 'tensorrt' package + init_benchmark_instance(use_trtexec=False, plugin_libraries=trt_plugins) + precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} + autotuner = region_pattern_autotuning_workflow( + onnx_model, + quant_type=quantize_mode, + default_dq_dtype=precision_map[high_precision_dtype], + ) + + # Export model with Q/DQ insertion + onnx_path_autotune = onnx_path.replace(".onnx", ".quant_autotune.onnx") + onnx_bytes = autotuner.export_onnx(onnx_path_autotune, insert_qdq=True, best=True) + # intermediate_generated_files.append(onnx_path_autotune) + + # Get nodes and op types to quantize + onnx_model_autotune = onnx.load_from_string(onnx_bytes) + nodes_to_quantize_autotune = get_quantized_nodes(onnx_model_autotune) + nodes_to_quantize_autotune_names = [n.name for n in nodes_to_quantize_autotune] + op_types_to_quantize = list(get_autotuner_quantizable_ops()) + + # Get non-quantizable tensors + # List of non-quantizable tensors in the form of (src_node, dst_node, tensor_name) + no_quantize_inputs = [] + for node in nodes_to_quantize_autotune: + for idx, inp in enumerate(node.inputs): + if inp.inputs and inp.inputs[0].op != "DequantizeLinear": + src_node = node.i(idx) + no_quantize_inputs.append((src_node, node, inp.name)) + + return nodes_to_quantize_autotune_names, op_types_to_quantize, no_quantize_inputs + + def quantize( onnx_path: str, quantize_mode: str = "int8", @@ -275,6 +322,7 @@ def quantize( input_shapes_profile: Sequence[dict[str, str]] | None = None, direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, **kwargs: Any, ) -> None: 
"""Quantizes the provided ONNX model. @@ -398,6 +446,9 @@ def quantize( Target ONNX opset version for the quantized model. If None, uses required minimum opset (19 for int8/fp8, 21 for int4, 23 for nvfp4). If the specified opset is lower than the required minimum, a warning will be issued and the opset will be upgraded to the required minimum. + autotune: + If True, detect optimal Q/DQ node placements according to the TensorRT version and platform available. + If False, use the default pattern-based quantization approach. kwargs: Additional keyword arguments for int4 quantization, including: - awqlite_alpha_step (float): Alpha step for lite, range [0, 1]. @@ -486,26 +537,40 @@ def quantize( # Check op types spelling in 'op_types_to_exclude' and '_to_quantize' validate_op_types_spelling(onnx_path, op_types_to_quantize, op_types_to_exclude) - # (1) If disable_mha_qdq is set, don't add Q/DQ layers to MatMuls in MHA pattern. - # (2) else when quantize_mode == "int8", if seq_len > 512, don't add Q/DQ layers to - # MatMuls in MHA pattern. - # (3) else when quantize_mode == "fp8", if head_size > 256 or head_size <= 8 - # or mha doesn't meet fp8 fMHA v2 pattern, don't add Q/DQ layers to MatMuls in MHA pattern. - nodes_to_exclude = find_nodes_from_mha_to_exclude( - onnx_path, - use_external_data_format, - nodes_to_exclude, - disable_mha_qdq, - quantize_mode, - intermediate_generated_files, - calibration_data_reader, - calibration_eps, - ) + if not autotune: + # (1) If disable_mha_qdq is set, don't add Q/DQ layers to MatMuls in MHA pattern. + # (2) else when quantize_mode == "int8", if seq_len > 512, don't add Q/DQ layers to + # MatMuls in MHA pattern. + # (3) else when quantize_mode == "fp8", if head_size > 256 or head_size <= 8 + # or mha doesn't meet fp8 fMHA v2 pattern, don't add Q/DQ layers to MatMuls in MHA pattern. 
+ nodes_to_exclude = find_nodes_from_mha_to_exclude( + onnx_path, + use_external_data_format, + nodes_to_exclude, + disable_mha_qdq, + quantize_mode, + intermediate_generated_files, + calibration_data_reader, + calibration_eps, + ) if calibrate_per_node and not calibration_shapes: calibration_shapes = get_input_shapes(onnx_path) if quantize_mode in ["fp8", "int8"]: + no_quantize_inputs = [] + if autotune: + nodes_to_quantize_autotune, op_types_to_quantize, no_quantize_inputs = ( + _find_nodes_to_quantize_autotune( + onnx_path, + onnx_model, + quantize_mode, + trt_plugins, + high_precision_dtype, + ) + ) + nodes_to_quantize.extend(nodes_to_quantize_autotune) + quantize_func = quantize_int8 if quantize_mode == "int8" else quantize_fp8 onnx_model = quantize_func( onnx_path=onnx_path, @@ -531,8 +596,15 @@ def quantize( custom_ops_to_quantize=list(custom_ops_to_quantize.keys()), direct_io_types=direct_io_types, opset=opset, + autotune=autotune, + no_quantize_inputs=no_quantize_inputs, **kwargs, ) + + # if autotune: + # # Copy real scales to quantized model + # print() + elif "int4" in quantize_mode: onnx_model = quantize_int4( onnx_path=onnx_path, diff --git a/modelopt/onnx/utils.py b/modelopt/onnx/utils.py index 4025ea065..168af663b 100644 --- a/modelopt/onnx/utils.py +++ b/modelopt/onnx/utils.py @@ -172,6 +172,24 @@ def get_dynamic_graph_inputs(onnx_model: onnx.ModelProto): return [inp for inp in graph.inputs if any(isinstance(s, str) or s <= 0 for s in inp.shape)] +def get_quantized_nodes(onnx_model: onnx.ModelProto) -> list: + """This function returns the nodes preceded by a DQ node. + + Args: + onnx_model: ONNX model to traverse. + + Returns: + List of quantized nodes. 
+ """ + graph = gs.import_onnx(onnx_model) + + return [ + node + for node in graph.nodes + if any(inp.inputs[0].op == "DequantizeLinear" for inp in node.inputs if inp.inputs) + ] + + def _get_all_shapes(container: Any) -> dict[str, list[int]]: """This method returns the shape of tensors within a RepeatedCompositeContainer. From 29e8dd20dcde0f5503c7ec6e4425b08a2d7c1d8d Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 23 Feb 2026 20:26:34 -0500 Subject: [PATCH 11/42] fix precommit failures Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/common.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index fe22e19c0..8717685c7 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -524,7 +524,6 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: distance = scheme.distance(existing_scheme) if distance < self.minimum_distance: # Schemes are too similar, keep the better one - too_similar = True if scheme.latency_ms < existing_scheme.latency_ms: # New scheme is better; mark existing for removal existing_to_remove = existing_scheme @@ -541,10 +540,6 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: filtered_schemes.remove(existing_to_remove) if not too_similar: filtered_schemes.append(scheme) - elif schemes_to_replace: - for scheme_to_replace in schemes_to_replace: - filtered_schemes.remove(scheme_to_replace) - filtered_schemes.append(scheme) sorted_schemes = filtered_schemes From 7f69882cc9f7cb3cbd2118f7406d900f585c2672 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 23 Feb 2026 20:28:00 -0500 Subject: [PATCH 12/42] Fix: Add->Q/DQ->Activation(Relu) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- 
modelopt/onnx/op_types.py | 22 +++++++++ modelopt/onnx/quantization/fp8.py | 5 +- modelopt/onnx/quantization/int8.py | 5 +- modelopt/onnx/quantization/ort_utils.py | 62 +++++++++++++------------ modelopt/onnx/quantization/quantize.py | 62 ++++++++++++++++++------- modelopt/onnx/utils.py | 5 +- 6 files changed, 105 insertions(+), 56 deletions(-) diff --git a/modelopt/onnx/op_types.py b/modelopt/onnx/op_types.py index 7e11d25e6..42085e18f 100644 --- a/modelopt/onnx/op_types.py +++ b/modelopt/onnx/op_types.py @@ -386,3 +386,25 @@ def get_symmetric_ops(): "BitwiseOr", "BitwiseXor", } + + +def get_activation_ops(): + """Returns set of activation operations.""" + return { + "Relu", + "LeakyRelu", + "PRelu", + "Elu", + "Selu", + "ThresholdedRelu", + "Sigmoid", + "Tanh", + "HardSigmoid", + "Softmax", + "LogSoftmax", + "Clip", + "Softplus", + "Softsign", + "Swish", + "HardSwish", + } diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py index d8e0349ca..e181e1864 100755 --- a/modelopt/onnx/quantization/fp8.py +++ b/modelopt/onnx/quantization/fp8.py @@ -184,7 +184,6 @@ def quantize( direct_io_types: bool = False, opset: int | None = None, autotune: bool = False, - no_quantize_inputs: list[tuple[gs.Node, gs.Node, str]] | None = None, **kwargs, ) -> onnx.ModelProto: """Applies FP8 GEMM only quantization to an ONNX file. 
@@ -247,7 +246,7 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, - autotune, + kwargs.get("op_types_needing_output_quant"), ) logger.info( f"Quantizable op types in the model: {[t for t in op_types_to_quantize if t in op_types]}" @@ -255,7 +254,7 @@ def quantize( # Collect node names to include in quantization nodes_to_quantize = nodes_to_quantize or [] - no_quantize_inputs = no_quantize_inputs or [] + no_quantize_inputs = kwargs.get("no_quantize_inputs", []) if not autotune: nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) if not nodes_to_quantize: diff --git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py index b1f781fb5..27c87abd4 100755 --- a/modelopt/onnx/quantization/int8.py +++ b/modelopt/onnx/quantization/int8.py @@ -134,7 +134,6 @@ def quantize( direct_io_types: bool = False, opset: int | None = None, autotune: bool = False, - no_quantize_inputs: list[tuple[Node, Node, str]] | None = None, **kwargs, ) -> onnx.ModelProto: """Applies INT8 quantization to an ONNX file using the compiler friendly heuristics. 
@@ -192,13 +191,13 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, - autotune, + kwargs.get("op_types_needing_output_quant"), ) logger.info(f"Quantizable op types: {[t for t in quantizable_op_types if t in op_types]}") # Collect node names to include in quantization nodes_to_quantize = nodes_to_quantize or [] - no_quantize_inputs = no_quantize_inputs or [] + no_quantize_inputs = kwargs.get("no_quantize_inputs", []) if not autotune: nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) if not nodes_to_quantize: diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 089c8850f..173fbb06d 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -271,7 +271,7 @@ def configure_ort( calibration_eps: list[str] | None = None, calibrate_per_node: bool = False, custom_ops_to_quantize: list[str] = [], - autotune: bool = False, + op_types_needing_output_quant: list[str] | None = None, ): """Configure and patches ORT to support ModelOpt ONNX quantization.""" logger.info("Configuring ORT for ModelOpt ONNX quantization") @@ -290,38 +290,40 @@ def configure_ort( # Patch ORT modules to fix bugs and support some edge cases patch_ort_modules(calibrate_per_node) - if not autotune: - # Remove copy, reduction and activation ops from ORT QDQ registry - logger.debug("Removing non-quantizable ops from QDQ registry") - for op_type in [ - "ArgMax", - "Concat", - "EmbedLayerNormalization", - "Gather", - "GatherElements", - "GatherND", - "InstanceNormalization", - "LeakyRelu", - "Pad", - "Relu", - "Reshape", - "Slice", - "Sigmoid", - "Softmax", - "Split", - "Squeeze", - "Transpose", - "Unsqueeze", - "Where", - ]: - if op_type in QLinearOpsRegistry: - del QLinearOpsRegistry[op_type] - if op_type in QDQRegistry: - del QDQRegistry[op_type] + # Remove copy, reduction and activation ops from ORT QDQ registry + logger.debug("Removing non-quantizable ops from 
QDQ registry") + for op_type in { + "ArgMax", + "Concat", + "EmbedLayerNormalization", + "Gather", + "GatherElements", + "GatherND", + "InstanceNormalization", + "LeakyRelu", + "Pad", + "Relu", + "Reshape", + "Slice", + "Sigmoid", + "Softmax", + "Split", + "Squeeze", + "Transpose", + "Unsqueeze", + "Where", + } - set(op_types_to_quantize): + if op_type in QLinearOpsRegistry: + del QLinearOpsRegistry[op_type] + if op_type in QDQRegistry: + del QDQRegistry[op_type] # Prepare TensorRT friendly quantization settings no_output_quantization_op_types = [ - op_type for op_type in op_types if op_type not in custom_ops_to_quantize + op_type + for op_type in op_types + if op_type not in custom_ops_to_quantize + and op_type not in (op_types_needing_output_quant or []) ] if trt_extra_plugin_lib_paths is not None: trt_extra_plugin_lib_paths = ";".join(trt_extra_plugin_lib_paths) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index a4f631f9f..743b5e368 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -44,7 +44,7 @@ import onnxslim from modelopt.onnx.logging_config import configure_logging, logger -from modelopt.onnx.op_types import is_data_dependent_shape_op +from modelopt.onnx.op_types import get_activation_ops, is_data_dependent_shape_op from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -249,7 +249,7 @@ def _find_nodes_to_quantize_autotune( quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", -) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]]]: +) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops from modelopt.onnx.quantization.autotune.workflows import ( @@ -277,16 +277,43 @@ def _find_nodes_to_quantize_autotune( 
nodes_to_quantize_autotune_names = [n.name for n in nodes_to_quantize_autotune] op_types_to_quantize = list(get_autotuner_quantizable_ops()) - # Get non-quantizable tensors + # Get non-quantizable tensors and identify op types whose outputs are quantized. # List of non-quantizable tensors in the form of (src_node, dst_node, tensor_name) no_quantize_inputs = [] + # List of ops to enable output quantization. + # By default, all ONNX standard ops have output quantization disabled due to TensorRT's quantization recipe + # (inputs and weights only). However, this causes QDQRemovableActivation (used for Relu, Sigmoid, etc.) to exit + # early when it checks is_tensor_quantized() on its input, producing no Q/DQ between e.g. Add and Relu. This list + # will be used in configure_ort() to enable output quantization of the ops included in it. + op_types_needing_output_quant = set() for node in nodes_to_quantize_autotune: for idx, inp in enumerate(node.inputs): if inp.inputs and inp.inputs[0].op != "DequantizeLinear": src_node = node.i(idx) no_quantize_inputs.append((src_node, node, inp.name)) + elif ( + inp.inputs + and inp.inputs[0].op == "DequantizeLinear" + and node.op in get_activation_ops() + ): + # Trace back through DQ→Q to find the node whose output is being quantized. + # Path: node.input ← DQ ← quantized_tensor ← Q ← original_tensor ← producer + dq_node = inp.inputs[0] + quantized_tensor = dq_node.inputs[0] # Q's output (= DQ's input) + if quantized_tensor.inputs: + q_node = quantized_tensor.inputs[0] # QuantizeLinear node + if q_node.op == "QuantizeLinear" and q_node.inputs: + original_tensor = q_node.inputs[0] # e.g. Add_output_0 + if original_tensor.inputs: + producer = original_tensor.inputs[0] # e.g. 
Add + op_types_needing_output_quant.add(producer.op) - return nodes_to_quantize_autotune_names, op_types_to_quantize, no_quantize_inputs + return ( + nodes_to_quantize_autotune_names, + op_types_to_quantize, + no_quantize_inputs, + list(op_types_needing_output_quant), + ) def quantize( @@ -558,18 +585,22 @@ def quantize( calibration_shapes = get_input_shapes(onnx_path) if quantize_mode in ["fp8", "int8"]: - no_quantize_inputs = [] if autotune: - nodes_to_quantize_autotune, op_types_to_quantize, no_quantize_inputs = ( - _find_nodes_to_quantize_autotune( - onnx_path, - onnx_model, - quantize_mode, - trt_plugins, - high_precision_dtype, - ) + ( + nodes_to_quantize_autotune, + op_types_to_quantize, + no_quantize_inputs, + op_types_needing_output_quant, + ) = _find_nodes_to_quantize_autotune( + onnx_path, + onnx_model, + quantize_mode, + trt_plugins, + high_precision_dtype, ) nodes_to_quantize.extend(nodes_to_quantize_autotune) + kwargs["no_quantize_inputs"] = no_quantize_inputs + kwargs["op_types_needing_output_quant"] = op_types_needing_output_quant quantize_func = quantize_int8 if quantize_mode == "int8" else quantize_fp8 onnx_model = quantize_func( @@ -597,14 +628,9 @@ def quantize( direct_io_types=direct_io_types, opset=opset, autotune=autotune, - no_quantize_inputs=no_quantize_inputs, **kwargs, ) - # if autotune: - # # Copy real scales to quantized model - # print() - elif "int4" in quantize_mode: onnx_model = quantize_int4( onnx_path=onnx_path, diff --git a/modelopt/onnx/utils.py b/modelopt/onnx/utils.py index 168af663b..e1e9715f1 100644 --- a/modelopt/onnx/utils.py +++ b/modelopt/onnx/utils.py @@ -173,13 +173,13 @@ def get_dynamic_graph_inputs(onnx_model: onnx.ModelProto): def get_quantized_nodes(onnx_model: onnx.ModelProto) -> list: - """This function returns the nodes preceded by a DQ node. + """This function returns the nodes preceded by a DQ node or followed by a Q node. Args: onnx_model: ONNX model to traverse. Returns: - List of quantized nodes. 
+ List of quantized nodes (input or output). """ graph = gs.import_onnx(onnx_model) @@ -187,6 +187,7 @@ def get_quantized_nodes(onnx_model: onnx.ModelProto) -> list: node for node in graph.nodes if any(inp.inputs[0].op == "DequantizeLinear" for inp in node.inputs if inp.inputs) + or any(out.outputs[0].op == "QuantizeLinear" for out in node.outputs if out.outputs) ] From bb030bec6ef7d1d9ec84df0752df9f18273f284c Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 23 Feb 2026 20:29:14 -0500 Subject: [PATCH 13/42] Fix: correctly dequantize Add input with shared Q/DQ Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/graph_utils.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py index efa77dd7b..9ef88d4a9 100755 --- a/modelopt/onnx/quantization/graph_utils.py +++ b/modelopt/onnx/quantization/graph_utils.py @@ -616,16 +616,27 @@ def remove_partial_input_qdq( # Reached end of the graph continue if dq_node.op == "DequantizeLinear": - dq_node = dq_node.outputs[0] # source_node->Q->DQ->target_node0 + dq_output = dq_node.outputs[0] # source_node->Q->DQ->target_node + + # Look up the specific target node in the quantized graph. + # With DedicatedQDQPair=False, a shared Q/DQ pair may feed multiple consumers + # (e.g. Conv activation AND Add residual). Always patch the intended target + # rather than the first consumer of the DQ output to avoid removing Q/DQ from + # the wrong branch. 
+ target_node_in_graph = graph_nodes.get(target.name) + if target_node_in_graph is None: + continue - # Find the input index in the target connecting with source_node + # Find the input index in the target that is connected to the DQ output target_input_idx_arr = [ - idx for idx, inp in enumerate(dq_node.outputs[0].inputs) if inp.name == dq_node.name + idx + for idx, inp in enumerate(target_node_in_graph.inputs) + if inp.name == dq_output.name ] target_input_idx = target_input_idx_arr[0] if target_input_idx_arr else 0 - # Connect the output of source_node with the output of DQ - dq_node.outputs[0].inputs[target_input_idx] = source_node.outputs[0] + # Connect the target's input directly to source_node's output (bypass Q/DQ) + target_node_in_graph.inputs[target_input_idx] = source_node.outputs[0] # Check for quantized residual Adds where the parallel branch is not being quantized for source, target, non_qdq_input_name in no_quantize_inputs: From 616227c944e48a94fd45f15929c0a1346f89b596 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 24 Feb 2026 12:21:47 -0500 Subject: [PATCH 14/42] [5916893] Fix weighted ops quantization logic: both input and weights Q/DQ need to be added or removed Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/autotuner.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/modelopt/onnx/quantization/autotune/autotuner.py b/modelopt/onnx/quantization/autotune/autotuner.py index 69038c59a..7afc50559 100644 --- a/modelopt/onnx/quantization/autotune/autotuner.py +++ b/modelopt/onnx/quantization/autotune/autotuner.py @@ -22,6 +22,19 @@ from modelopt.onnx.quantization.autotune.common import Config, PatternCache, Region, RegionType from modelopt.onnx.quantization.autotune.region_search import CombinedRegionSearch +_MUTATION_SPECS = [ + ("node_inputs", "node input points", lambda p: (p.node_index, p.input_index)), + ( + 
"child_region_inputs", + "region composite points", + lambda p: (p.region_index, p.input_index), + ), + ( + "region_outputs", + "region output points", + lambda p: (p.region_index, p.node_index, p.output_index), + ), +] class QDQAutotuner(QDQAutotunerBase): """Q/DQ autotuner with automatic region discovery around compute-intensive ops.""" From afee0a4a4504dd65a7f99cde15906ed77d2703b1 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:29:26 -0500 Subject: [PATCH 15/42] Changed keep_output_dir to True as default Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/common.py | 3 --- modelopt/onnx/quantization/autotune/workflows.py | 2 +- modelopt/onnx/quantization/quantize.py | 3 ++- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index 8717685c7..d3b3de272 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -531,9 +531,6 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: else: # Existing scheme is better, skip new one too_similar = True - if scheme.latency_ms < existing_scheme.latency_ms: - # New scheme is better, mark existing for replacement - schemes_to_replace.append(existing_scheme) break if existing_to_remove is not None: diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index 57ad73015..a8ba279e0 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -170,7 +170,7 @@ def region_pattern_autotuning_workflow( qdq_baseline_model: str | None = None, node_filter_list: list[str] | None = None, verbose: bool = False, - keep_output_dir: bool = False, + keep_output_dir: bool = True, ) -> QDQAutotuner: """Run automated Q/DQ 
(Quantization/Dequantization) optimization on an ONNX model. diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 743b5e368..64847057a 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -264,6 +264,7 @@ def _find_nodes_to_quantize_autotune( onnx_model, quant_type=quantize_mode, default_dq_dtype=precision_map[high_precision_dtype], + keep_output_dir=False, ) # Export model with Q/DQ insertion @@ -296,7 +297,7 @@ def _find_nodes_to_quantize_autotune( and inp.inputs[0].op == "DequantizeLinear" and node.op in get_activation_ops() ): - # Trace back through DQ→Q to find the node whose output is being quantized. + # Trace back through DQ → Q to find the node whose output is being quantized. # Path: node.input ← DQ ← quantized_tensor ← Q ← original_tensor ← producer dq_node = inp.inputs[0] quantized_tensor = dq_node.inputs[0] # Q's output (= DQ's input) From faf0bbbf5f15f0e6f806b45f80c1ebb9e31bd782 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:46:25 -0500 Subject: [PATCH 16/42] test_workflow was moved to 'tests/gpu/onnx' Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/autotune/test_workflows.py | 80 ------------------- 1 file changed, 80 deletions(-) delete mode 100644 tests/unit/onnx/quantization/autotune/test_workflows.py diff --git a/tests/unit/onnx/quantization/autotune/test_workflows.py b/tests/unit/onnx/quantization/autotune/test_workflows.py deleted file mode 100644 index c8edafc06..000000000 --- a/tests/unit/onnx/quantization/autotune/test_workflows.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import tempfile -from pathlib import Path - -import onnx -import pytest - -# Add parent directory to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from _test_utils.import_helper import skip_if_no_tensorrt, skip_if_no_trtexec -from unit.onnx.quantization.autotune.test_autotuner import create_simple_conv_model - -from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, -) - - -@pytest.mark.parametrize("use_trtexec", [True, False]) -def test_export_quantized_model(use_trtexec): - """Test exporting quantized model with Q/DQ.""" - if use_trtexec: - skip_if_no_trtexec() - else: - skip_if_no_tensorrt() - - model = create_simple_conv_model() - - with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: - baseline_model_path = f.name - - # Save baseline model - onnx.save(model, baseline_model_path) - - output_dir = baseline_model_path.strip(".onnx") - output_path = output_dir + ".quant.onnx" - - try: - init_benchmark_instance(use_trtexec=use_trtexec) - autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) - - # Export model with Q/DQ insertion - autotuner.export_onnx(output_path, insert_qdq=True) - - # Verify file was created - assert os.path.exists(output_path) - - # Verify it's a valid ONNX model - exported_model = onnx.load(output_path) - assert exported_model is not None - - # Verify that it contains Q/DQ nodes - qdq_nodes = [ - n - for n in exported_model.graph.node - if n.op_type in 
["QuantizeLinear", "DequantizeLinear"] - ] - assert qdq_nodes, "Q/DQ nodes not found in quantized model" - - print("✓ QDQAutotuner export quantized model") - finally: - if os.path.exists(output_path): - os.unlink(output_path) From 08bf713497fb4c02646a820e7323147cc321d5c2 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:56:50 -0500 Subject: [PATCH 17/42] Removed cli.py, moved into __main__.py Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/cli.py | 294 --------------------- 1 file changed, 294 deletions(-) delete mode 100644 modelopt/onnx/quantization/autotune/cli.py diff --git a/modelopt/onnx/quantization/autotune/cli.py b/modelopt/onnx/quantization/autotune/cli.py deleted file mode 100644 index a5809f9a5..000000000 --- a/modelopt/onnx/quantization/autotune/cli.py +++ /dev/null @@ -1,294 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""CLI argument parsing and execution for ONNX Q/DQ autotuning. - -This module provides `run_autotune` which handles both argument parsing and -workflow execution. See `__main__.py` for usage examples. 
-""" - -import argparse -import sys -from pathlib import Path - -from modelopt.onnx.logging_config import logger -from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, -) - -DEFAULT_OUTPUT_DIR = "./autotuner_output" -DEFAULT_NUM_SCHEMES = 30 -DEFAULT_QUANT_TYPE = "int8" -DEFAULT_DQ_DTYPE = "float32" -DEFAULT_TIMING_CACHE = "/tmp/trtexec_timing.cache" # nosec B108 -DEFAULT_WARMUP_RUNS = 5 -DEFAULT_TIMING_RUNS = 20 - - -def validate_file_path(path: str | None, description: str) -> Path | None: - """Validate that a file path exists. - - Args: - path: Path string to validate (can be None) - description: Description of the file for error messages - - Returns: - Path object if valid, None if path is None - - Raises: - SystemExit: If path is provided but doesn't exist - """ - if path is None: - return None - - path_obj = Path(path) - if not path_obj.exists(): - logger.error(f"{description} not found: {path_obj}") - sys.exit(1) - - return path_obj - - -def log_benchmark_config(args): - """Log TensorRT benchmark configuration for transparency. - - Logs timing cache path, warmup/timing run counts, and any custom - plugin libraries that will be loaded. - - Args: - args: Parsed command-line arguments with benchmark configuration - """ - logger.info("Initializing TensorRT benchmark") - logger.info(f" Timing cache: {args.timing_cache}") - logger.info(f" Warmup runs: {args.warmup_runs}") - logger.info(f" Timing runs: {args.timing_runs}") - if args.plugin_libraries: - logger.info(f" Plugin libraries: {', '.join(args.plugin_libraries)}") - - -def run_autotune(args=None) -> int: - """Execute the complete pattern-based Q/DQ autotuning workflow. - - This function orchestrates the entire optimization process: - 1. Parses command-line arguments (if not provided) - 2. Validates input paths (model, baseline, output directory) - 3. Initializes TensorRT benchmark instance - 4. 
Runs pattern-based region autotuning workflow - 5. Handles interruptions gracefully with state preservation - - Args: - args: Optional parsed command-line arguments. If None, parses sys.argv. - - Returns: - Exit code: - - 0: Success - - 1: Autotuning failed (exception occurred) - - 130: Interrupted by user (Ctrl+C) - """ - if args is None: - args = _get_autotune_parser().parse_args() - - model_path = validate_file_path(args.onnx_path, "Model file") - validate_file_path(args.qdq_baseline, "QDQ baseline model") - output_dir = Path(args.output) - - log_benchmark_config(args) - init_benchmark_instance( - use_trtexec=args.use_trtexec, - plugin_libraries=args.plugin_libraries, - timing_cache_file=args.timing_cache, - warmup_runs=args.warmup_runs, - timing_runs=args.timing_runs, - ) - - logger.info("Autotuning Mode: Pattern-Based") - - try: - node_filter_list = None - if args.node_filter_list: - filter_file = validate_file_path(args.node_filter_list, "Node filter list file") - if filter_file: - with open(filter_file) as f: - node_filter_list = [ - line.strip() - for line in f - if line.strip() and not line.strip().startswith("#") - ] - logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") - - region_pattern_autotuning_workflow( - model_path=str(model_path), - output_dir=output_dir, - num_schemes_per_region=args.num_schemes, - pattern_cache_file=args.pattern_cache_file, - state_file=args.state_file, - quant_type=args.quant_type, - default_dq_dtype=args.default_dq_dtype, - qdq_baseline_model=args.qdq_baseline, - node_filter_list=node_filter_list, - ) - - logger.info("\n" + "=" * 70) - logger.info("✓ Autotuning completed successfully!") - logger.info(f"✓ Results: {output_dir}") - logger.info("=" * 70) - return 0 - - except KeyboardInterrupt: - logger.warning("\nInterrupted by user") - state_file = args.state_file or output_dir / "autotuner_state.yaml" - logger.info(f"Progress saved to: {state_file}") - return 130 - - except Exception as e: - 
logger.error(f"\nAutotuning failed: {e}", exc_info=args.verbose) - return 1 - - -def _get_autotune_parser() -> argparse.ArgumentParser: - """Create and configure the command-line argument parser.""" - parser = argparse.ArgumentParser( - prog="modelopt.onnx.quantization.autotune", - description="ONNX Q/DQ Autotuning with TensorRT", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Basic usage - python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx - - # Import patterns from QDQ baseline model - python -m modelopt.onnx.quantization.autotune \\ - --onnx_path model.onnx --qdq_baseline baseline.onnx - - # Use pattern cache for warm-start - python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --pattern_cache cache.yaml - - # Full example with all options - python -m modelopt.onnx.quantization.autotune \\ - --onnx_path model.onnx --schemes_per_region 50 \\ - --pattern_cache cache.yaml --qdq_baseline baseline.onnx \\ - --quant_type int8 --verbose - """, - ) - - # Model and Output - io_group = parser.add_argument_group("Model and Output") - io_group.add_argument( - "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file" - ) - io_group.add_argument( - "--output", - "-o", - type=str, - default=DEFAULT_OUTPUT_DIR, - help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})", - ) - - # Autotuning Strategy - strategy_group = parser.add_argument_group("Autotuning Strategy") - strategy_group.add_argument( - "--schemes_per_region", - "-s", - type=int, - default=DEFAULT_NUM_SCHEMES, - dest="num_schemes", - help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})", - ) - strategy_group.add_argument( - "--pattern_cache", - type=str, - default=None, - dest="pattern_cache_file", - help="Path to pattern cache YAML for warm-start (optional)", - ) - strategy_group.add_argument( - "--qdq_baseline", - type=str, - default=None, - help="Path to QDQ baseline ONNX model to import 
quantization patterns (optional)", - ) - strategy_group.add_argument( - "--state_file", - type=str, - default=None, - help="State file path for resume capability (default: /autotuner_state.yaml)", - ) - strategy_group.add_argument( - "--node_filter_list", - type=str, - default=None, - help="Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " - "Regions without any matching nodes are skipped during autotuning.", - ) - - # Quantization - quant_group = parser.add_argument_group("Quantization") - quant_group.add_argument( - "--quant_type", - type=str, - default=DEFAULT_QUANT_TYPE, - choices=["int8", "fp8"], - help=f"Quantization data type (default: {DEFAULT_QUANT_TYPE})", - ) - quant_group.add_argument( - "--default_dq_dtype", - type=str, - default=DEFAULT_DQ_DTYPE, - choices=["float16", "float32", "bfloat16"], - help="Default DQ output dtype if cannot be deduced (optional)", - ) - - # TensorRT Benchmark - trt_group = parser.add_argument_group("TensorRT Benchmark") - trt_group.add_argument( - "--use_trtexec", - action="store_true", - help="Use trtexec for benchmarking (default: False)", - default=False, - ) - trt_group.add_argument( - "--timing_cache", - type=str, - default=DEFAULT_TIMING_CACHE, - help=f"TensorRT timing cache file (default: {DEFAULT_TIMING_CACHE})", - ) - trt_group.add_argument( - "--warmup_runs", - type=int, - default=DEFAULT_WARMUP_RUNS, - help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})", - ) - trt_group.add_argument( - "--timing_runs", - type=int, - default=DEFAULT_TIMING_RUNS, - help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})", - ) - trt_group.add_argument( - "--plugin_libraries", - "--plugins", - type=str, - nargs="+", - default=None, - dest="plugin_libraries", - help="TensorRT plugin libraries (.so files) to load (optional, space-separated)", - ) - - # Logging - parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose DEBUG logging") - - return parser From 
81fce4884f91a298cd5d7e6cfbb6c1dcf4afb67a Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:57:24 -0500 Subject: [PATCH 18/42] Removed PatternSchemes import from region_pattern.py: no longer needed. Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/region_pattern.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/region_pattern.py b/modelopt/onnx/quantization/autotune/region_pattern.py index 9f80bd56e..a32273f84 100644 --- a/modelopt/onnx/quantization/autotune/region_pattern.py +++ b/modelopt/onnx/quantization/autotune/region_pattern.py @@ -21,7 +21,7 @@ import onnx_graphsurgeon as gs from modelopt.onnx.op_types import get_symmetric_ops -from modelopt.onnx.quantization.autotune.common import InsertionScheme, PatternSchemes, Region +from modelopt.onnx.quantization.autotune.common import InsertionScheme, Region from modelopt.onnx.quantization.autotune.insertion_points import ( ChildRegionInputInsertionPoint, ChildRegionOutputInsertionPoint, @@ -161,9 +161,6 @@ def matches( is provided but other is not a Region TypeError: If other is neither RegionPattern nor Region """ - if isinstance(scheme, PatternSchemes): - return set() - if isinstance(other, RegionPattern): if scheme is not None: raise ValueError("scheme parameter can only be used when matching against a Region") From 7a57b8d47e32c8850747f3e8c7eb9d5bccfaeeaf Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:57:58 -0500 Subject: [PATCH 19/42] Added intermediate Autotune model to be removed at the end of the quantization workflow Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py 
index 64847057a..5e872e24b 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -249,6 +249,7 @@ def _find_nodes_to_quantize_autotune( quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", + intermediate_generated_files: list[str] = [], ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops @@ -270,7 +271,7 @@ def _find_nodes_to_quantize_autotune( # Export model with Q/DQ insertion onnx_path_autotune = onnx_path.replace(".onnx", ".quant_autotune.onnx") onnx_bytes = autotuner.export_onnx(onnx_path_autotune, insert_qdq=True, best=True) - # intermediate_generated_files.append(onnx_path_autotune) + intermediate_generated_files.append(onnx_path_autotune) # Get nodes and op types to quantize onnx_model_autotune = onnx.load_from_string(onnx_bytes) @@ -598,6 +599,7 @@ def quantize( quantize_mode, trt_plugins, high_precision_dtype, + intermediate_generated_files, ) nodes_to_quantize.extend(nodes_to_quantize_autotune) kwargs["no_quantize_inputs"] = no_quantize_inputs From a71fc914a2b9f6a88247466c453565608462416c Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:59:34 -0500 Subject: [PATCH 20/42] Removed _MUTATION_SPECS from autotuner.py: moved to autotuner_base.py Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/autotuner.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/autotuner.py b/modelopt/onnx/quantization/autotune/autotuner.py index 7afc50559..69038c59a 100644 --- a/modelopt/onnx/quantization/autotune/autotuner.py +++ b/modelopt/onnx/quantization/autotune/autotuner.py @@ -22,19 +22,6 @@ from modelopt.onnx.quantization.autotune.common import Config, PatternCache, 
Region, RegionType from modelopt.onnx.quantization.autotune.region_search import CombinedRegionSearch -_MUTATION_SPECS = [ - ("node_inputs", "node input points", lambda p: (p.node_index, p.input_index)), - ( - "child_region_inputs", - "region composite points", - lambda p: (p.region_index, p.input_index), - ), - ( - "region_outputs", - "region output points", - lambda p: (p.region_index, p.node_index, p.output_index), - ), -] class QDQAutotuner(QDQAutotunerBase): """Q/DQ autotuner with automatic region discovery around compute-intensive ops.""" From 01e8be0982b27376b9ec43791f37f4f26f2fa584 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 13:02:50 -0500 Subject: [PATCH 21/42] Removed test_config and test_pattern_cache. Should be added in the original Auto Q/DQ PR. Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/test_config.py | 143 ------------------ .../onnx/quantization/autotune/test_region.py | 4 +- 2 files changed, 3 insertions(+), 144 deletions(-) delete mode 100644 tests/unit/onnx/quantization/autotune/test_config.py diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py deleted file mode 100644 index c5b20a8a9..000000000 --- a/tests/unit/onnx/quantization/autotune/test_config.py +++ /dev/null @@ -1,143 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Tests for the Config class in the autotuner. - -Tests configuration parameter validation and defaults. -""" - -import os -import sys -import unittest - -# Add parent directory to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from modelopt.onnx.quantization.autotune.common import Config - - -class TestConfig(unittest.TestCase): - """Test Config class functionality.""" - - def test_default_values(self): - """Test that Config has correct default values.""" - config = Config() - - # Logging - assert not config.verbose - - # Performance thresholds - - # Q/DQ defaults - assert config.default_q_scale == 0.1 - assert config.default_q_zero_point == 0 - assert config.default_quant_type == "int8" - - # Region builder settings - assert config.maximum_sequence_region_size == 10 - assert config.minimum_topdown_search_size == 10 - - # Scheme generation parameters - assert config.top_percent_to_mutate == 0.1 - assert config.minimum_schemes_to_mutate == 10 - assert config.maximum_mutations == 3 - assert config.maximum_generation_attempts == 100 - - # Pattern cache parameters - assert config.pattern_cache_minimum_distance == 4 - assert config.pattern_cache_max_entries_per_pattern == 32 - - print("✓ Config default values are correct") - - def test_custom_values(self): - """Test creating Config with custom values.""" - config = Config( - verbose=True, - default_q_scale=0.05, - default_q_zero_point=128, - default_quant_type="fp8", - maximum_sequence_region_size=20, - ) - - assert config.verbose - assert config.default_q_scale 
== 0.05 - assert config.default_q_zero_point == 128 - assert config.default_quant_type == "fp8" - assert config.maximum_sequence_region_size == 20 - print("✓ Config custom values work correctly") - - def test_region_size_validation(self): - """Test that region size parameters are positive.""" - config = Config(maximum_sequence_region_size=50, minimum_topdown_search_size=5) - assert config.maximum_sequence_region_size > 0 - assert config.minimum_topdown_search_size > 0 - print("✓ Config region size validation") - - def test_genetic_algorithm_params(self): - """Test genetic algorithm parameters.""" - config = Config( - top_percent_to_mutate=0.2, - minimum_schemes_to_mutate=2, - maximum_mutations=5, - maximum_generation_attempts=50, - ) - - assert config.top_percent_to_mutate == 0.2 - assert config.minimum_schemes_to_mutate == 2 - assert config.maximum_mutations == 5 - assert config.maximum_generation_attempts == 50 - print("✓ Config genetic algorithm parameters") - - def test_pattern_cache_params(self): - """Test pattern cache parameters.""" - config = Config(pattern_cache_minimum_distance=3, pattern_cache_max_entries_per_pattern=10) - - assert config.pattern_cache_minimum_distance == 3 - assert config.pattern_cache_max_entries_per_pattern == 10 - print("✓ Config pattern cache parameters") - - -def run_tests(): - """Run all Config tests.""" - print("=" * 70) - print("Config Class Test Suite") - print("=" * 70) - - loader = unittest.TestLoader() - suite = unittest.TestSuite() - suite.addTests(loader.loadTestsFromTestCase(TestConfig)) - - runner = unittest.TextTestRunner(verbosity=2) - result = runner.run(suite) - - print("\n" + "=" * 70) - print("Test Summary") - print("=" * 70) - print(f"Tests run: {result.testsRun}") - print(f"Successes: {result.testsRun - len(result.failures) - len(result.errors)}") - print(f"Failures: {len(result.failures)}") - print(f"Errors: {len(result.errors)}") - - if result.wasSuccessful(): - print("\n✓ All Config tests passed!") - return 0 
- else: - print("\n✗ Some tests failed") - return 1 - - -if __name__ == "__main__": - sys.exit(run_tests()) diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index 3bbf34ac9..5a733017d 100644 --- a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -59,6 +59,7 @@ def test_parent_child_relationship(parent_with_children): assert parent.get_children() == [child1, child2] assert child1.parent == child2.parent == parent + def test_add_and_get_nodes(leaf): leaf.nodes.update([0, 1, 2]) assert set(leaf.get_nodes()) == {0, 1, 2} @@ -78,6 +79,7 @@ def test_region_size_recursive(parent_with_children): parent.nodes.add(5) assert len(parent.get_region_nodes_and_descendants()) == 6 + def test_metadata(leaf): leaf.metadata.update({"pattern": "Conv->Relu", "quantizable": "true"}) assert leaf.metadata == {"pattern": "Conv->Relu", "quantizable": "true"} @@ -107,4 +109,4 @@ def test_remove_child(): parent.add_child(child) parent.remove_child(child) assert parent.get_children() == [] - assert child.parent is None \ No newline at end of file + assert child.parent is None From ad7a60da0f437f06c1976d589f444b6f47e33706 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:26:53 -0500 Subject: [PATCH 22/42] Fixed minor coderabbit suggestions Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/workflows.py | 2 +- modelopt/onnx/quantization/quantize.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index a8ba279e0..6dd84d4c1 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -385,7 +385,7 @@ def region_pattern_autotuning_workflow( if not keep_output_dir: 
logger.debug( - f"Removing output dir: {output_dir}. Select 'keep_output_dir=False' if you wish to keep it." + f"Removing output dir: {output_dir}. Set 'keep_output_dir=True' if you wish to keep it." ) shutil.rmtree(output_dir) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 5e872e24b..641dfefda 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -249,7 +249,7 @@ def _find_nodes_to_quantize_autotune( quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", - intermediate_generated_files: list[str] = [], + intermediate_generated_files: list[str] | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops @@ -258,6 +258,9 @@ def _find_nodes_to_quantize_autotune( region_pattern_autotuning_workflow, ) + if intermediate_generated_files is None: + intermediate_generated_files = [] + # Initialize Autotuner with the Python 'tensorrt' package init_benchmark_instance(use_trtexec=False, plugin_libraries=trt_plugins) precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} From 7589668544f4c75632dc37e4a899233f16fbc832 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:29:05 -0500 Subject: [PATCH 23/42] Moved autotune imports to the top of the file Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 641dfefda..184483dec 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -45,6 +45,11 @@ from modelopt.onnx.logging_config import 
configure_logging, logger from modelopt.onnx.op_types import get_activation_ops, is_data_dependent_shape_op +from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -252,12 +257,6 @@ def _find_nodes_to_quantize_autotune( intermediate_generated_files: list[str] | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") - from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops - from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, - ) - if intermediate_generated_files is None: intermediate_generated_files = [] From db4c3effd937a3648eb4f73dd26f7cc1a1b6af7c Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:02:52 -0500 Subject: [PATCH 24/42] Eliminate intermediate ONNX export in _find_nodes_to_quantize_autotune(). Directly use Insertion Points information. 
Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/autotune/autotuner_base.py | 136 ++++++++++++++---- modelopt/onnx/quantization/quantize.py | 62 +------- 2 files changed, 115 insertions(+), 83 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index a519d7c61..e1ea3c29e 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -35,7 +35,7 @@ import yaml from modelopt.onnx.logging_config import logger -from modelopt.onnx.op_types import is_linear_op +from modelopt.onnx.op_types import get_activation_ops, is_linear_op from modelopt.onnx.quantization.autotune.common import ( AutotunerNotInitializedError, Config, @@ -46,7 +46,10 @@ Region, ) from modelopt.onnx.quantization.autotune.export_utils import export_qdq_onnx -from modelopt.onnx.quantization.autotune.insertion_points import ResolvedInsertionPoint +from modelopt.onnx.quantization.autotune.insertion_points import ( + ResolvedInsertionPoint, + get_autotuner_quantizable_ops, +) from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern from modelopt.onnx.quantization.graph_utils import get_tensor_consumer_node_indices @@ -434,6 +437,111 @@ def _exclude_overlapping_insertion_points( if all_region_ips: logger.debug(f" → Excluded {len(all_region_ips)} overlapping insertion points") + @_requires_init + def get_resolved_insertion_points( + self, best: bool = True, verbose: bool = False + ) -> set[ResolvedInsertionPoint]: + """Compute Q/DQ insertion points for the best schemes without exporting the model. + + Args: + best: If True, use the best scheme for each region. If False, use the current scheme. + verbose: If True, log matched-region counts and per-region insertion point details. + + Returns: + Set of ResolvedInsertionPoint objects representing where Q/DQ pairs should be inserted. 
+ + Raises: + AutotunerNotInitializedError: If initialize() hasn't been called + """ + resolved_insertion_points: set[ResolvedInsertionPoint] = set() + matched_regions = 0 + + if verbose: + logger.debug(f"Resolving Q/DQ insertion points from {len(self.regions)} regions") + + for region in self.regions: + current_scheme, pattern = self._resolve_scheme_for_region(region, best) + if current_scheme is None: + continue + self._exclude_overlapping_insertion_points(resolved_insertion_points, region, pattern) + new_ips = pattern.matches(region, self.graph, current_scheme) + if new_ips: + resolved_insertion_points.update(new_ips) + matched_regions += 1 + if verbose: + logger.debug(f" → Added {len(new_ips)} insertion points") + if verbose: + logger.debug( + f"Matched {matched_regions}/{len(self.regions)} regions, " + f"total {len(resolved_insertion_points)} unique insertion points" + ) + return resolved_insertion_points + + @_requires_init + def get_ort_quantization_config( + self, + ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: + """Derive ORT quantization configuration from resolved insertion points. + + Returns the four parameters consumed by INT8 and FP8 quantize() to replicate the autotuner's + Q/DQ placement decisions without exporting any intermediate ONNX file to disk. + + Returns: + nodes_to_quantize: Node names that have at least one covered Q/DQ input. + op_types_to_quantize: Op types eligible for quantization. + no_quantize_inputs: List of (src_node, dst_node, tensor_name) tuples for inputs + of quantized nodes that should NOT receive Q/DQ. + op_types_needing_output_quant: Producer op types whose output feeds a covered + activation-op input (needed so ORT inserts Q/DQ between e.g. Add and Relu). + + Raises: + AutotunerNotInitializedError: If initialize() hasn't been called. 
+ """ + resolved_ips = self.get_resolved_insertion_points(best=True) + graph = self.graph + + # Build (node_index, input_index) pairs that have Q/DQ + covered: set[tuple[int, int]] = set() + for ip in resolved_ips: + if ip.node_index is not None and ip.input_index is not None: + covered.add((ip.node_index, ip.input_index)) + else: + # Tensor-level insertion point: expand to all consumer (node, input) pairs + for consumer_idx in graph.tensor_users_map.get(ip.tensor_name, []): + node = graph.nodes[consumer_idx] + for inp_idx, inp in enumerate(node.inputs): + if getattr(inp, "name", None) == ip.tensor_name: + covered.add((consumer_idx, inp_idx)) + + quantized_node_indices: set[int] = {node_idx for node_idx, _ in covered} + nodes_to_quantize = [graph.nodes[i].name for i in quantized_node_indices] + op_types_to_quantize = list(get_autotuner_quantizable_ops()) + + # Inputs of quantized nodes NOT covered by Q/DQ (only non-constant producer inputs) + no_quantize_inputs: list[tuple[gs.Node, gs.Node, str]] = [] + for node_idx in quantized_node_indices: + node = graph.nodes[node_idx] + for inp_idx, inp in enumerate(node.inputs): + if (node_idx, inp_idx) not in covered and getattr(inp, "name", None): + if inp.inputs: + no_quantize_inputs.append((inp.inputs[0], node, inp.name)) + + # Producer op types whose output feeds a covered activation-op input + op_types_needing_output_quant: set[str] = set() + for node_idx, inp_idx in covered: + node = graph.nodes[node_idx] + if node.op in get_activation_ops(): + tensor = node.inputs[inp_idx] + if tensor.inputs: + op_types_needing_output_quant.add(tensor.inputs[0].op) + + return ( + nodes_to_quantize, + op_types_to_quantize, + no_quantize_inputs, + list(op_types_needing_output_quant), + ) + @_requires_init def export_onnx( self, output_path: str | None = None, insert_qdq: bool = True, best: bool = False @@ -469,29 +577,7 @@ def export_onnx( ) if insert_qdq: - matched_regions = 0 - - logger.debug(f"Resolving Q/DQ insertion points from 
{len(self.regions)} regions") - - for region in self.regions: - current_scheme, pattern = self._resolve_scheme_for_region(region, best) - if current_scheme is None: - continue - - self._exclude_overlapping_insertion_points( - resolved_insertion_points, region, pattern - ) - - new_ips = pattern.matches(region, self.graph, current_scheme) - if new_ips: - resolved_insertion_points.update(new_ips) - matched_regions += 1 - logger.debug(f" → Added {len(new_ips)} insertion points") - - logger.debug( - f"Matched {matched_regions}/{len(self.regions)} regions, " - f"total {len(resolved_insertion_points)} unique insertion points" - ) + resolved_insertion_points = self.get_resolved_insertion_points(best=best, verbose=True) unique_tensors = len(resolved_insertion_points) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 184483dec..8239fc3b3 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -44,8 +44,7 @@ import onnxslim from modelopt.onnx.logging_config import configure_logging, logger -from modelopt.onnx.op_types import get_activation_ops, is_data_dependent_shape_op -from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops +from modelopt.onnx.op_types import is_data_dependent_shape_op from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, region_pattern_autotuning_workflow, @@ -78,7 +77,6 @@ QDQ_PRECISION_MIN_OPSET, duplicate_shared_constants, get_opset_version, - get_quantized_nodes, name_onnx_nodes, save_onnx, ) @@ -249,75 +247,25 @@ def _preprocess_onnx( def _find_nodes_to_quantize_autotune( - onnx_path: str, onnx_model: onnx.ModelProto, quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", - intermediate_generated_files: list[str] | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") - if 
intermediate_generated_files is None: - intermediate_generated_files = [] # Initialize Autotuner with the Python 'tensorrt' package init_benchmark_instance(use_trtexec=False, plugin_libraries=trt_plugins) precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} + + # Get Autotuner Q/DQ node placements autotuner = region_pattern_autotuning_workflow( onnx_model, quant_type=quantize_mode, default_dq_dtype=precision_map[high_precision_dtype], keep_output_dir=False, ) - - # Export model with Q/DQ insertion - onnx_path_autotune = onnx_path.replace(".onnx", ".quant_autotune.onnx") - onnx_bytes = autotuner.export_onnx(onnx_path_autotune, insert_qdq=True, best=True) - intermediate_generated_files.append(onnx_path_autotune) - - # Get nodes and op types to quantize - onnx_model_autotune = onnx.load_from_string(onnx_bytes) - nodes_to_quantize_autotune = get_quantized_nodes(onnx_model_autotune) - nodes_to_quantize_autotune_names = [n.name for n in nodes_to_quantize_autotune] - op_types_to_quantize = list(get_autotuner_quantizable_ops()) - - # Get non-quantizable tensors and identify op types whose outputs are quantized. - # List of non-quantizable tensors in the form of (src_node, dst_node, tensor_name) - no_quantize_inputs = [] - # List of ops to enable output quantization. - # By default, all ONNX standard ops have output quantization disabled due to TensorRT's quantization recipe - # (inputs and weights only). However, this causes QDQRemovableActivation (used for Relu, Sigmoid, etc.) to exit - # early when it checks is_tensor_quantized() on its input, producing no Q/DQ between e.g. Add and Relu. This list - # will be used in configure_ort() to enable output quantization of the ops included in it. 
- op_types_needing_output_quant = set() - for node in nodes_to_quantize_autotune: - for idx, inp in enumerate(node.inputs): - if inp.inputs and inp.inputs[0].op != "DequantizeLinear": - src_node = node.i(idx) - no_quantize_inputs.append((src_node, node, inp.name)) - elif ( - inp.inputs - and inp.inputs[0].op == "DequantizeLinear" - and node.op in get_activation_ops() - ): - # Trace back through DQ → Q to find the node whose output is being quantized. - # Path: node.input ← DQ ← quantized_tensor ← Q ← original_tensor ← producer - dq_node = inp.inputs[0] - quantized_tensor = dq_node.inputs[0] # Q's output (= DQ's input) - if quantized_tensor.inputs: - q_node = quantized_tensor.inputs[0] # QuantizeLinear node - if q_node.op == "QuantizeLinear" and q_node.inputs: - original_tensor = q_node.inputs[0] # e.g. Add_output_0 - if original_tensor.inputs: - producer = original_tensor.inputs[0] # e.g. Add - op_types_needing_output_quant.add(producer.op) - - return ( - nodes_to_quantize_autotune_names, - op_types_to_quantize, - no_quantize_inputs, - list(op_types_needing_output_quant), - ) + return autotuner.get_ort_quantization_config() def quantize( @@ -596,12 +544,10 @@ def quantize( no_quantize_inputs, op_types_needing_output_quant, ) = _find_nodes_to_quantize_autotune( - onnx_path, onnx_model, quantize_mode, trt_plugins, high_precision_dtype, - intermediate_generated_files, ) nodes_to_quantize.extend(nodes_to_quantize_autotune) kwargs["no_quantize_inputs"] = no_quantize_inputs From 42a0bdfe1f37ae61b470ba6b1e83f7a17c5323f0 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 13:39:58 -0500 Subject: [PATCH 25/42] Add support for Add->Q/DQ->Relu patterns by including those 'Add' nodes in the nodes_to_quantize Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/autotuner_base.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git 
a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index e1ea3c29e..1d5b18f14 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -441,7 +441,7 @@ def _exclude_overlapping_insertion_points( def get_resolved_insertion_points( self, best: bool = True, verbose: bool = False ) -> set[ResolvedInsertionPoint]: - """Compute Q/DQ insertion points for the best schemes without exporting the model. + """Compute Q/DQ insertion points for the best schemes (assuming best=True). Args: best: If True, use the best scheme for each region. If False, use the current scheme. @@ -513,7 +513,20 @@ def get_ort_quantization_config( if getattr(inp, "name", None) == ip.tensor_name: covered.add((consumer_idx, inp_idx)) + # Nodes that consume a covered (DQ-fed) input quantized_node_indices: set[int] = {node_idx for node_idx, _ in covered} + + # Also include producer nodes of covered inputs: a producer whose output feeds a + # covered slot needs to be in nodes_to_quantize so ORT can place Q on its output + # (e.g., Add must be included when Q/DQ sits between Add and Relu). 
+ node_name_to_idx = {node.name: i for i, node in enumerate(graph.nodes)} + for node_idx, inp_idx in covered: + tensor = graph.nodes[node_idx].inputs[inp_idx] + if tensor.inputs: + producer_idx = node_name_to_idx.get(tensor.inputs[0].name) + if producer_idx is not None: + quantized_node_indices.add(producer_idx) + nodes_to_quantize = [graph.nodes[i].name for i in quantized_node_indices] op_types_to_quantize = list(get_autotuner_quantizable_ops()) @@ -527,6 +540,7 @@ def get_ort_quantization_config( no_quantize_inputs.append((inp.inputs[0], node, inp.name)) # Producer op types whose output feeds a covered activation-op input + # (e.g., to support Add->Q/DQ->Relu patterns) op_types_needing_output_quant: set[str] = set() for node_idx, inp_idx in covered: node = graph.nodes[node_idx] From a70dbd3be8f61ed99f80111d422415e366134c37 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 13:43:37 -0500 Subject: [PATCH 26/42] Add integration test Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/models.py | 52 ++++++++++ .../test_autotune_quantization_integration.py | 98 +++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 tests/gpu/onnx/quantization/test_autotune_quantization_integration.py diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index fc63f6690..e230b4e97 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -20,6 +20,8 @@ """ import onnx +import torch +import torch.nn as nn from onnx import helper @@ -52,3 +54,53 @@ def _create_simple_conv_onnx_model(): ], ) return helper.make_model(graph, producer_name="test") + + +def _create_simple_resnet18_onnx_model(): # -> onnx.ModelProto: + """Build a ResNet-18 subgraph (stem + layer1) for MOQ + Autotuner integration tests. 
+ + Architecture: + Conv(3→64, 7×7, stride=2) → ReLU → MaxPool(3×3, stride=2) + → BasicBlock(64→64) → BasicBlock(64→64) + + Input shape: [1, 3, 1024, 1024], output shape: [1, 64, 256, 256]. + """ + + class _BasicBlock(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(64, 64, 3, padding=1, bias=True) + self.act1 = nn.ReLU() + self.conv2 = nn.Conv2d(64, 64, 3, padding=1, bias=True) + self.act2 = nn.ReLU() + + def forward(self, x): + return self.act2(self.conv2(self.act1(self.conv1(x))) + x) + + class _Model(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=True) + self.act1 = nn.ReLU() + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.layer1 = nn.Sequential(_BasicBlock(), _BasicBlock()) + + def forward(self, x): + return self.layer1(self.maxpool(self.act1(self.conv1(x)))) + + torch.manual_seed(42) + model = _Model().eval() + input_tensor = torch.zeros(1, 3, 1024, 1024) + + return model, input_tensor + # buf = io.BytesIO() + # torch.onnx.export( + # model, + # dummy_input, + # buf, + # input_names=["input"], + # output_names=["output"], + # opset_version=17, + # ) + # buf.seek(0) + # return onnx.load_from_string(buf.read()) diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py new file mode 100644 index 000000000..a38bde128 --- /dev/null +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from unittest.mock import patch + +import onnx +import onnx_graphsurgeon as gs +from _test_utils.import_helper import skip_if_no_tensorrt +from _test_utils.onnx.lib_test_models import export_as_onnx +from _test_utils.onnx.quantization.autotune.models import _create_simple_resnet18_onnx_model + +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) +from modelopt.onnx.quantization.quantize import _preprocess_onnx, quantize + +skip_if_no_tensorrt() + + +def _quantized_tensor_indices(model: onnx.ModelProto) -> set[tuple[str, int]]: + """Return (node_name, input_index) for every DQ-fed input slot in the model.""" + graph = gs.import_onnx(model) + return { + (node.name, inp_idx) + for node in graph.nodes + for inp_idx, inp in enumerate(node.inputs) + if inp.inputs and inp.inputs[0].op == "DequantizeLinear" + } + + +def test_autotune_quantization_integration(tmp_path="./"): + """Ensure that the quantized tensors are the same for standalone Autotune and MOQ with Autotune. + + Runs the autotuner once to obtain a fixed set of insertion points. The same + autotuner instance is then injected into quantize() via patching so that both + sides reflect identical placement decisions without a second TRT profiling run. + + Compares the set of (node_name, input_index) pairs where a DQ node feeds the + input between: + - the autotuner's own export (via export_onnx), and + - the quantize(autotune=True) output model. 
+ """ + model_torch, input_tensor = _create_simple_resnet18_onnx_model() + onnx_path = os.path.join(tmp_path, "model.onnx") + output_path = onnx_path.replace(".onnx", ".quant.onnx") + + # Export torch model to ONNX + export_as_onnx(model_torch, input_tensor, onnx_filename=onnx_path) + + # Load and pre-process ONNX + onnx_path, onnx_model, *_ = _preprocess_onnx( + onnx_path, + use_external_data_format=False, + output_path=output_path, + enable_shared_constants_duplication=True, + trt_plugins=None, + trt_plugins_precision=None, + override_shapes=None, # type: ignore[arg-type] + quantize_mode="int8", + ) + + # Run autotune once to get a determined set of placement decisions. + init_benchmark_instance(use_trtexec=False) + autotuner = region_pattern_autotuning_workflow( + onnx_model, + quant_type="int8", + default_dq_dtype="float16", + keep_output_dir=False, + ) + + # Autotune path: export the Q/DQ model directly and collect quantized tensor slots. + autotune_model = onnx.load_from_string(autotuner.export_onnx(best=True)) + autotune_tensors = _quantized_tensor_indices(autotune_model) + + # MOQ + Autotune path: inject the same autotuner so placement decisions are identical, + # then run the full quantize() pipeline and collect quantized tensor slots. 
+ with patch( + "modelopt.onnx.quantization.quantize.region_pattern_autotuning_workflow", + return_value=autotuner, + ): + quantize(onnx_path, autotune=True, output_path=output_path) + + moq_tensors = _quantized_tensor_indices(onnx.load(output_path)) + assert autotune_tensors == moq_tensors From e1c8af7fcf22220ffe4226ab895d2ec80b303db5 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 14:25:13 -0500 Subject: [PATCH 27/42] Remove 'keep_output_dir' arg (no longer needed due to tmp path) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/workflows.py | 9 --------- modelopt/onnx/quantization/quantize.py | 1 - 2 files changed, 10 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index 6dd84d4c1..a87dcf8b9 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -20,7 +20,6 @@ """ import fnmatch -import shutil import tempfile from pathlib import Path @@ -170,7 +169,6 @@ def region_pattern_autotuning_workflow( qdq_baseline_model: str | None = None, node_filter_list: list[str] | None = None, verbose: bool = False, - keep_output_dir: bool = True, ) -> QDQAutotuner: """Run automated Q/DQ (Quantization/Dequantization) optimization on an ONNX model. @@ -214,7 +212,6 @@ def region_pattern_autotuning_workflow( node_filter_list: Optional list of wildcard patterns to filter ONNX nodes. Regions without any matching nodes are skipped during autotuning (default: None) verbose: Enable verbose logging in Config for detailed autotuner output (default: False) - keep_output_dir: If True, keep output_dir, otherwise, remove it at the end of this function. 
Returns: QDQAutotuner instance after autotuning @@ -383,10 +380,4 @@ def region_pattern_autotuning_workflow( logger.debug(f" Logs: {logs_dir}") logger.debug(f" Region models: {models_dir}") - if not keep_output_dir: - logger.debug( - f"Removing output dir: {output_dir}. Set 'keep_output_dir=True' if you wish to keep it." - ) - shutil.rmtree(output_dir) - return autotuner diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 8239fc3b3..88acb7796 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -263,7 +263,6 @@ def _find_nodes_to_quantize_autotune( onnx_model, quant_type=quantize_mode, default_dq_dtype=precision_map[high_precision_dtype], - keep_output_dir=False, ) return autotuner.get_ort_quantization_config() From 159b9f25b0e47f8c24f096dfee607d8e298aab39 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 14:51:02 -0500 Subject: [PATCH 28/42] Remove 'get_quantized_nodes' and other comments that are no longer needed Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/utils.py | 19 ------------------- .../onnx/quantization/autotune/models.py | 11 ----------- 2 files changed, 30 deletions(-) diff --git a/modelopt/onnx/utils.py b/modelopt/onnx/utils.py index e1e9715f1..4025ea065 100644 --- a/modelopt/onnx/utils.py +++ b/modelopt/onnx/utils.py @@ -172,25 +172,6 @@ def get_dynamic_graph_inputs(onnx_model: onnx.ModelProto): return [inp for inp in graph.inputs if any(isinstance(s, str) or s <= 0 for s in inp.shape)] -def get_quantized_nodes(onnx_model: onnx.ModelProto) -> list: - """This function returns the nodes preceded by a DQ node or followed by a Q node. - - Args: - onnx_model: ONNX model to traverse. - - Returns: - List of quantized nodes (input or output). 
- """ - graph = gs.import_onnx(onnx_model) - - return [ - node - for node in graph.nodes - if any(inp.inputs[0].op == "DequantizeLinear" for inp in node.inputs if inp.inputs) - or any(out.outputs[0].op == "QuantizeLinear" for out in node.outputs if out.outputs) - ] - - def _get_all_shapes(container: Any) -> dict[str, list[int]]: """This method returns the shape of tensors within a RepeatedCompositeContainer. diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index e230b4e97..68342b9ff 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -93,14 +93,3 @@ def forward(self, x): input_tensor = torch.zeros(1, 3, 1024, 1024) return model, input_tensor - # buf = io.BytesIO() - # torch.onnx.export( - # model, - # dummy_input, - # buf, - # input_names=["input"], - # output_names=["output"], - # opset_version=17, - # ) - # buf.seek(0) - # return onnx.load_from_string(buf.read()) From 51df98273576d45182d40db268a405545fcf544a Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 15:16:20 -0500 Subject: [PATCH 29/42] Added docstring for 'default_dq_dtype' in workflows.py Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/workflows.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index a87dcf8b9..483cf1314 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -206,6 +206,7 @@ def region_pattern_autotuning_workflow( uses /autotuner_state.yaml (default: None) quant_type: Quantization data type - "int8" for INT8 quantization (default), "fp8" for FP8 quantization + default_dq_dtype: Dtype for DequantizeLinear output; "float32" (default) or "float16". 
 qdq_baseline_model: Optional path to a pre-quantized ONNX model. If provided,
            extracts Q/DQ insertion patterns and adds them to pattern cache for warm-start
            (default: None)

From 1dc03cd264a1d4f352379a24d6e50b12b6e7b2e1 Mon Sep 17 00:00:00 2001
From: gcunhase <4861122+gcunhase@users.noreply.github.com>
Date: Wed, 4 Mar 2026 20:38:50 -0500
Subject: [PATCH 30/42] Added mode presets and additional autotune
 configurations

Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com>
---
 modelopt/onnx/quantization/__main__.py        | 139 +++++++++++++++++-
 .../onnx/quantization/autotune/__init__.py    |   6 +
 .../onnx/quantization/autotune/__main__.py    |  34 +++--
 modelopt/onnx/quantization/quantize.py        |  55 ++++++-
 4 files changed, 217 insertions(+), 17 deletions(-)

diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py
index 433980249..4ba4f072b 100644
--- a/modelopt/onnx/quantization/__main__.py
+++ b/modelopt/onnx/quantization/__main__.py
@@ -20,6 +20,11 @@

 import numpy as np

+from modelopt.onnx.quantization.autotune import (
+    MODE_PRESETS,
+    _StoreWithExplicitFlag,
+    get_node_filter_list,
+)
 from modelopt.onnx.quantization.quantize import quantize

 __all__ = ["main"]
@@ -297,14 +302,126 @@ def get_parser() -> argparse.ArgumentParser:
     )
     argparser.add_argument(
         "--autotune",
+        nargs="?",
+        const="default",
+        default=None,
+        choices=["quick", "default", "extensive"],
+        help=(
+            "If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. "
+            "Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): "
+            " - 'quick': fewer schemes and benchmark runs for quick exploration;"
+            " - 'default': balanced, recommended for most cases;"
+            " - 'extensive': more schemes and runs for extensive search and thorough tuning. "
+            "Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset."
+ ), + ) + + autotune_group = argparser.add_argument_group( + "Autotune (only applicable when --autotune is set)" + ) + autotune_group.add_argument( + "--autotune_output_dir", + type=str, + default=None, + help="Output directory for autotune results (state file, logs). Default: temp directory.", + ) + autotune_group.add_argument( + "--autotune_schemes_per_region", + type=int, + default=30, + help="Number of Q/DQ schemes to test per region.", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_schemes_per_region", + ) + autotune_group.add_argument( + "--autotune_pattern_cache", + type=str, + default=None, + dest="autotune_pattern_cache_file", + help="Path to pattern cache YAML for warm-start.", + ) + autotune_group.add_argument( + "--autotune_qdq_baseline", + type=str, + default=None, + help="Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start.", + ) + autotune_group.add_argument( + "--autotune_state_file", + type=str, + default=None, + help="State file path for crash recovery and resume capability (default: /autotuner_state.yaml).", + ) + autotune_group.add_argument( + "--autotune_node_filter_list", + type=str, + default=None, + help=( + "Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " + "Regions without any matching nodes are skipped during autotuning." 
+ ), + ) + autotune_group.add_argument( + "--autotune_verbose", + action="store_true", + help="Enable verbose logging in the autotuner.", + ) + autotune_group.add_argument( + "--autotune_use_trtexec", action="store_true", + help="Use trtexec for benchmarking instead of the TensorRT Python API.", + ) + autotune_group.add_argument( + "--autotune_timing_cache", + type=str, + default=None, + help="TensorRT timing cache file for faster engine builds.", + ) + autotune_group.add_argument( + "--autotune_warmup_runs", + type=int, + default=5, + help="Number of warmup runs before timing.", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_warmup_runs", + ) + autotune_group.add_argument( + "--autotune_timing_runs", + type=int, + default=20, + help="Number of timed runs for latency measurement.", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_timing_runs", + ) + autotune_group.add_argument( + "--autotune_trtexec_args", + type=str, + default=None, help=( - "If set, detect optimal Q/DQ node placements according to the TensorRT version and platform available." + "Additional trtexec arguments as a single quoted string. " + "Example: --autotune_trtexec_args '--fp16 --workspace=4096'" ), ) return argparser +def apply_mode_presets(args) -> None: + """Apply --autotune=mode preset to schemes_per_region, warmup_runs, timing_runs. + + Only applies preset for an option when that option was not explicitly set on the + command line (explicit flags override the preset). 
+ """ + if args.autotune not in MODE_PRESETS: + return + preset = MODE_PRESETS[args.autotune] + if not getattr(args, "_explicit_autotune_schemes_per_region", False): + args.autotune_schemes_per_region = preset["schemes_per_region"] + if not getattr(args, "_explicit_autotune_warmup_runs", False): + args.autotune_warmup_runs = preset["warmup_runs"] + if not getattr(args, "_explicit_autotune_timing_runs", False): + args.autotune_timing_runs = preset["timing_runs"] + + def main(): """Command-line entrypoint for ONNX PTQ.""" args = get_parser().parse_args() @@ -338,6 +455,12 @@ def main(): else: raise + # Autotune configs + autotune_enabled = args.autotune is not None + if autotune_enabled: + apply_mode_presets(args) + autotune_node_filter_list = get_node_filter_list(args.autotune_node_filter_list) + quantize( args.onnx_path, quantize_mode=args.quantize_mode, @@ -369,7 +492,19 @@ def main(): calibrate_per_node=args.calibrate_per_node, direct_io_types=args.direct_io_types, opset=args.opset, - autotune=args.autotune, + autotune=autotune_enabled, + autotune_output_dir=args.autotune_output_dir, + autotune_num_schemes_per_region=args.autotune_schemes_per_region, + autotune_pattern_cache_file=args.autotune_pattern_cache_file, + autotune_state_file=args.autotune_state_file, + autotune_qdq_baseline=args.autotune_qdq_baseline, + autotune_node_filter_list=autotune_node_filter_list, + autotune_verbose=args.autotune_verbose, + autotune_use_trtexec=args.autotune_use_trtexec, + autotune_timing_cache=args.autotune_timing_cache, + autotune_warmup_runs=args.autotune_warmup_runs, + autotune_timing_runs=args.autotune_timing_runs, + autotune_trtexec_args=args.autotune_trtexec_args, ) diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index 7f14bb360..8243cfbef 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -20,6 +20,9 @@ region analysis to efficiently explore 
and optimize Q/DQ insertion strategies. """ +# Expose Autotune modes and args +from .__main__ import MODE_PRESETS, _StoreWithExplicitFlag, get_node_filter_list + # Core data structures from .autotuner import QDQAutotuner from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark @@ -44,6 +47,7 @@ from .region_search import CombinedRegionSearch __all__ = [ + "MODE_PRESETS", "AutotunerError", "AutotunerNotInitializedError", "ChildRegionInputInsertionPoint", @@ -62,4 +66,6 @@ "ResolvedInsertionPoint", "TensorRTPyBenchmark", "TrtExecBenchmark", + "_StoreWithExplicitFlag", + "get_node_filter_list", ] diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index cb7b3c281..9b4a2cd53 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -116,6 +116,27 @@ def log_benchmark_config(args): logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") +def get_node_filter_list(node_filter_list_path: str) -> list | None: + """Extract node filter list from node filters path. + + Args: + node_filter_list_path: Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). + + Returns: + Node filter list + """ + node_filter_list = None + if node_filter_list_path: + filter_file = validate_file_path(node_filter_list_path, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() for line in f if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + return node_filter_list + + def run_autotune() -> int: """Execute the complete pattern-based Q/DQ autotuning workflow. 
@@ -155,18 +176,7 @@ def run_autotune() -> int: return 1 try: - node_filter_list = None - if args.node_filter_list: - filter_file = validate_file_path(args.node_filter_list, "Node filter list file") - if filter_file: - with open(filter_file) as f: - node_filter_list = [ - line.strip() - for line in f - if line.strip() and not line.strip().startswith("#") - ] - logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") - + node_filter_list = get_node_filter_list(args.node_filter_list) region_pattern_autotuning_workflow( model_path=str(model_path), output_dir=output_dir, diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 88acb7796..397147ac1 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -36,6 +36,7 @@ import shutil import tempfile from collections.abc import Sequence +from pathlib import Path from typing import Any import onnx @@ -251,18 +252,42 @@ def _find_nodes_to_quantize_autotune( quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", + output_dir: str | None = None, + num_schemes_per_region: int = 30, + pattern_cache_file: str | None = None, + state_file: str | None = None, + qdq_baseline_model: str | None = None, + node_filter_list: list[str] | None = None, + verbose: bool = False, + use_trtexec: bool = False, + timing_cache_file: str | None = None, + warmup_runs: int = 5, + timing_runs: int = 20, + trtexec_args: str | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") - # Initialize Autotuner with the Python 'tensorrt' package - init_benchmark_instance(use_trtexec=False, plugin_libraries=trt_plugins) + init_benchmark_instance( + use_trtexec=use_trtexec, + plugin_libraries=trt_plugins, + timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + trtexec_args=trtexec_args.split() if 
trtexec_args else None, + ) precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} - # Get Autotuner Q/DQ node placements autotuner = region_pattern_autotuning_workflow( onnx_model, + output_dir=Path(output_dir) if output_dir else None, + num_schemes_per_region=num_schemes_per_region, + pattern_cache_file=pattern_cache_file, + state_file=state_file, quant_type=quantize_mode, default_dq_dtype=precision_map[high_precision_dtype], + qdq_baseline_model=qdq_baseline_model, + node_filter_list=node_filter_list, + verbose=verbose, ) return autotuner.get_ort_quantization_config() @@ -301,6 +326,18 @@ def quantize( direct_io_types: bool = False, opset: int | None = None, autotune: bool = False, + autotune_output_dir: str | None = None, + autotune_num_schemes_per_region: int = 30, + autotune_pattern_cache_file: str | None = None, + autotune_state_file: str | None = None, + autotune_qdq_baseline: str | None = None, + autotune_node_filter_list: list[str] | None = None, + autotune_verbose: bool = False, + autotune_use_trtexec: bool = False, + autotune_timing_cache: str | None = None, + autotune_warmup_runs: int = 5, + autotune_timing_runs: int = 20, + autotune_trtexec_args: str | None = None, **kwargs: Any, ) -> None: """Quantizes the provided ONNX model. 
@@ -547,6 +584,18 @@ def quantize( quantize_mode, trt_plugins, high_precision_dtype, + output_dir=autotune_output_dir, + num_schemes_per_region=autotune_num_schemes_per_region, + pattern_cache_file=autotune_pattern_cache_file, + state_file=autotune_state_file, + qdq_baseline_model=autotune_qdq_baseline, + node_filter_list=autotune_node_filter_list, + verbose=autotune_verbose, + use_trtexec=autotune_use_trtexec, + timing_cache_file=autotune_timing_cache, + warmup_runs=autotune_warmup_runs, + timing_runs=autotune_timing_runs, + trtexec_args=autotune_trtexec_args, ) nodes_to_quantize.extend(nodes_to_quantize_autotune) kwargs["no_quantize_inputs"] = no_quantize_inputs From ddacbcb4cb999e774f73cb96ae39703288f580fb Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:03:21 -0500 Subject: [PATCH 31/42] Fixed tmp_path in test Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/test_autotune_quantization_integration.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py index a38bde128..86a377bd7 100644 --- a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from unittest.mock import patch import onnx @@ -42,7 +41,7 @@ def _quantized_tensor_indices(model: onnx.ModelProto) -> set[tuple[str, int]]: } -def test_autotune_quantization_integration(tmp_path="./"): +def test_autotune_quantization_integration(tmp_path): """Ensure that the quantized tensors are the same for standalone Autotune and MOQ with Autotune. Runs the autotuner once to obtain a fixed set of insertion points. 
The same @@ -55,7 +54,7 @@ def test_autotune_quantization_integration(tmp_path="./"): - the quantize(autotune=True) output model. """ model_torch, input_tensor = _create_simple_resnet18_onnx_model() - onnx_path = os.path.join(tmp_path, "model.onnx") + onnx_path = tmp_path / "model.onnx" output_path = onnx_path.replace(".onnx", ".quant.onnx") # Export torch model to ONNX From 689a90781efa2887b3d6bb7f9420c58fcdf65316 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:27:30 -0500 Subject: [PATCH 32/42] Fixed copilot comments Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 2 +- tests/_test_utils/onnx/quantization/autotune/models.py | 2 +- .../onnx/quantization/test_autotune_quantization_integration.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 397147ac1..c8930283a 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -250,7 +250,7 @@ def _preprocess_onnx( def _find_nodes_to_quantize_autotune( onnx_model: onnx.ModelProto, quantize_mode: str, - trt_plugins: list[str], + trt_plugins: list[str] | None, high_precision_dtype: str = "fp16", output_dir: str | None = None, num_schemes_per_region: int = 30, diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index 68342b9ff..84a8b4ab8 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -56,7 +56,7 @@ def _create_simple_conv_onnx_model(): return helper.make_model(graph, producer_name="test") -def _create_simple_resnet18_onnx_model(): # -> onnx.ModelProto: +def _create_simple_resnet18_model(): """Build a ResNet-18 subgraph (stem + layer1) for MOQ + Autotuner integration tests. 
Architecture: diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py index 86a377bd7..6e889e131 100644 --- a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -78,7 +78,6 @@ def test_autotune_quantization_integration(tmp_path): onnx_model, quant_type="int8", default_dq_dtype="float16", - keep_output_dir=False, ) # Autotune path: export the Q/DQ model directly and collect quantized tensor slots. From b64322fcae17ca20e469854ef3fec5a8bf33e414 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:41:27 -0500 Subject: [PATCH 33/42] Fix: skip rewiring in graph_utils if no index is found. This prevents silent corruption of the graph. Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/graph_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py index 9ef88d4a9..131723e61 100755 --- a/modelopt/onnx/quantization/graph_utils.py +++ b/modelopt/onnx/quantization/graph_utils.py @@ -633,7 +633,17 @@ def remove_partial_input_qdq( for idx, inp in enumerate(target_node_in_graph.inputs) if inp.name == dq_output.name ] - target_input_idx = target_input_idx_arr[0] if target_input_idx_arr else 0 + # If no input index is found (dq_output is not actually connected to target node), skip rewiring to + # prevent silent corruption of the graph. + if not target_input_idx_arr: + logger.warning( + "Expected DequantizeLinear output '%s' to be an input of node '%s', " + "but no matching input was found. 
Skipping Q/DQ bypass for this edge.", + dq_output.name, + target_node_in_graph.name, + ) + continue + target_input_idx = target_input_idx_arr[0] # Connect the target's input directly to source_node's output (bypass Q/DQ) target_node_in_graph.inputs[target_input_idx] = source_node.outputs[0] From 0a32bea36e93237ec3b34492329ef1dd08d958fb Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:30:37 -0500 Subject: [PATCH 34/42] Match args for preset mode default Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index 4ba4f072b..961ec0d5e 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -328,7 +328,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_schemes_per_region", type=int, - default=30, + default=50, help="Number of Q/DQ schemes to test per region.", action=_StoreWithExplicitFlag, explicit_attr="_explicit_autotune_schemes_per_region", @@ -380,7 +380,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_warmup_runs", type=int, - default=5, + default=50, help="Number of warmup runs before timing.", action=_StoreWithExplicitFlag, explicit_attr="_explicit_autotune_warmup_runs", @@ -388,7 +388,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_timing_runs", type=int, - default=20, + default=100, help="Number of timed runs for latency measurement.", action=_StoreWithExplicitFlag, explicit_attr="_explicit_autotune_timing_runs", From 7730b514782c4c55fd4e9cd28e0af127e4935e9a Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:02:05 -0400 Subject: [PATCH 35/42] Exposed _StoreWithExplicitFlag 
Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/__main__.py | 8 ++++---- modelopt/onnx/quantization/autotune/__init__.py | 4 ++-- modelopt/onnx/quantization/autotune/__main__.py | 10 ++++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index 961ec0d5e..e8ee53d76 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -22,7 +22,7 @@ from modelopt.onnx.quantization.autotune import ( MODE_PRESETS, - _StoreWithExplicitFlag, + StoreWithExplicitFlag, get_node_filter_list, ) from modelopt.onnx.quantization.quantize import quantize @@ -330,7 +330,7 @@ def get_parser() -> argparse.ArgumentParser: type=int, default=50, help="Number of Q/DQ schemes to test per region.", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_schemes_per_region", ) autotune_group.add_argument( @@ -382,7 +382,7 @@ def get_parser() -> argparse.ArgumentParser: type=int, default=50, help="Number of warmup runs before timing.", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_warmup_runs", ) autotune_group.add_argument( @@ -390,7 +390,7 @@ def get_parser() -> argparse.ArgumentParser: type=int, default=100, help="Number of timed runs for latency measurement.", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_timing_runs", ) autotune_group.add_argument( diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index 8243cfbef..b00e8c8f9 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -21,7 +21,7 @@ """ # Expose Autotune modes and args -from .__main__ import MODE_PRESETS, _StoreWithExplicitFlag, get_node_filter_list +from .__main__ import 
MODE_PRESETS, StoreWithExplicitFlag, get_node_filter_list # Core data structures from .autotuner import QDQAutotuner @@ -64,8 +64,8 @@ "RegionPattern", "RegionType", "ResolvedInsertionPoint", + "StoreWithExplicitFlag", "TensorRTPyBenchmark", "TrtExecBenchmark", - "_StoreWithExplicitFlag", "get_node_filter_list", ] diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index 9b4a2cd53..0b233d740 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -44,14 +44,16 @@ } -class _StoreWithExplicitFlag(argparse.Action): +class StoreWithExplicitFlag(argparse.Action): """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" def __init__(self, explicit_attr: str, *args, **kwargs): + """Initialize explicit attribute flag.""" self._explicit_attr = explicit_attr super().__init__(*args, **kwargs) def __call__(self, parser, namespace, values, option_string=None): + """Set attributes.""" setattr(namespace, self.dest, values) setattr(namespace, self._explicit_attr, True) @@ -272,7 +274,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: type=int, default=DEFAULT_NUM_SCHEMES, dest="num_schemes", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_num_schemes", help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)", ) @@ -338,7 +340,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--warmup_runs", type=int, default=DEFAULT_WARMUP_RUNS, - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_warmup_runs", help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)", ) @@ -346,7 +348,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--timing_runs", type=int, default=DEFAULT_TIMING_RUNS, - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, 
explicit_attr="_explicit_timing_runs", help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)", ) From eb0e064bfffb9510d75aed1122ae42fcbafd7d74 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:24:32 -0400 Subject: [PATCH 36/42] Renamed new_ips to new_insertion_points Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/autotuner_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index 1d5b18f14..6df297e95 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -464,12 +464,12 @@ def get_resolved_insertion_points( if current_scheme is None: continue self._exclude_overlapping_insertion_points(resolved_insertion_points, region, pattern) - new_ips = pattern.matches(region, self.graph, current_scheme) - if new_ips: - resolved_insertion_points.update(new_ips) + new_insertion_points = pattern.matches(region, self.graph, current_scheme) + if new_insertion_points: + resolved_insertion_points.update(new_insertion_points) matched_regions += 1 if verbose: - logger.debug(f" → Added {len(new_ips)} insertion points") + logger.debug(f" → Added {len(new_insertion_points)} insertion points") if verbose: logger.debug( f"Matched {matched_regions}/{len(self.regions)} regions, " From 7cc54a5c0fc817ad9805a722d698fcc12da613a1 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:59:06 -0400 Subject: [PATCH 37/42] Address coderabbit and copilot issues + other minor issues Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/__main__.py | 18 +++++---- .../onnx/quantization/autotune/__main__.py | 2 +- 
.../onnx/quantization/autotune/workflows.py | 19 ++++++--- modelopt/onnx/quantization/quantize.py | 39 ++++++++++++++++--- .../test_autotune_quantization_integration.py | 7 ++-- 5 files changed, 62 insertions(+), 23 deletions(-) diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index e8ee53d76..8a71291f1 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -307,11 +307,11 @@ def get_parser() -> argparse.ArgumentParser: default=None, choices=["quick", "default", "extensive"], help=( - "If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes." + "If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. " "Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): " - " - 'quick': fewer schemes and benchmark runs for for quick exploration;" - " - 'default': balanced, recommended for most cases;" - " - 'extensive': more schemes and runs for extensive search and thorough tuning." + " - 'quick': fewer schemes and benchmark runs for quick exploration; " + " - 'default': balanced, recommended for most cases; " + " - 'extensive': more schemes and runs for extensive search and thorough tuning. " "Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset." 
), ) @@ -328,7 +328,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_schemes_per_region", type=int, - default=50, + default=MODE_PRESETS["default"]["schemes_per_region"], help="Number of Q/DQ schemes to test per region.", action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_schemes_per_region", @@ -380,7 +380,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_warmup_runs", type=int, - default=50, + default=MODE_PRESETS["default"]["warmup_runs"], help="Number of warmup runs before timing.", action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_warmup_runs", @@ -388,7 +388,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_timing_runs", type=int, - default=100, + default=MODE_PRESETS["default"]["timing_runs"], help="Number of timed runs for latency measurement.", action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_timing_runs", @@ -459,7 +459,9 @@ def main(): autotune_enabled = args.autotune is not None if autotune_enabled: apply_mode_presets(args) - autotune_node_filter_list = get_node_filter_list(args.autotune_node_filter_list) + autotune_node_filter_list = ( + get_node_filter_list(args.autotune_node_filter_list) if autotune_enabled else None + ) quantize( args.onnx_path, diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index 0b233d740..a66585300 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -180,7 +180,7 @@ def run_autotune() -> int: try: node_filter_list = get_node_filter_list(args.node_filter_list) region_pattern_autotuning_workflow( - model_path=str(model_path), + model_or_path=str(model_path), output_dir=output_dir, num_schemes_per_region=args.num_schemes, pattern_cache_file=args.pattern_cache_file, diff --git a/modelopt/onnx/quantization/autotune/workflows.py 
b/modelopt/onnx/quantization/autotune/workflows.py
index 483cf1314..190882a31 100644
--- a/modelopt/onnx/quantization/autotune/workflows.py
+++ b/modelopt/onnx/quantization/autotune/workflows.py
@@ -20,6 +20,7 @@
 """
 
 import fnmatch
+import shutil
 import tempfile
 from pathlib import Path
 
@@ -159,7 +160,7 @@ def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool:
 
 
 def region_pattern_autotuning_workflow(
-    model_path: str | onnx.ModelProto,
+    model_or_path: str | onnx.ModelProto,
     output_dir: Path | None = None,
     num_schemes_per_region: int = 30,
     pattern_cache_file: str | None = None,
@@ -196,7 +197,7 @@ def region_pattern_autotuning_workflow(
         7. Export final optimized model with best Q/DQ scheme for each pattern
 
     Args:
-        model_path: Path to ONNX model file to optimize
+        model_or_path: Path to an ONNX model file, or an in-memory onnx.ModelProto, to optimize
         output_dir: Directory for output files (state, logs, models). Created if it doesn't exist.
         num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern.
Higher values explore more configurations but take longer (default: 30) @@ -217,6 +218,7 @@ def region_pattern_autotuning_workflow( Returns: QDQAutotuner instance after autotuning """ + output_dir_is_temp = output_dir is None if not output_dir: output_dir = Path(tempfile.mkdtemp()) @@ -230,11 +232,11 @@ def region_pattern_autotuning_workflow( state_file = str(output_dir / "autotuner_state.yaml") state_path = Path(state_file) - if isinstance(model_path, str): - logger.info(f"Loading model: {model_path}") - model = onnx.load(model_path) + if isinstance(model_or_path, str): + logger.info(f"Loading model: {model_or_path}") + model = onnx.load(model_or_path) else: - model = model_path + model = model_or_path pattern_cache = None if pattern_cache_file: @@ -381,4 +383,9 @@ def region_pattern_autotuning_workflow( logger.debug(f" Logs: {logs_dir}") logger.debug(f" Region models: {models_dir}") + # Remove temporary folder + if output_dir_is_temp and output_dir.exists(): + shutil.rmtree(output_dir) + logger.info(f"Temporary directory {output_dir} was deleted!") + return autotuner diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index c8930283a..889493ee3 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -46,6 +46,7 @@ from modelopt.onnx.logging_config import configure_logging, logger from modelopt.onnx.op_types import is_data_dependent_shape_op +from modelopt.onnx.quantization.autotune import MODE_PRESETS from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, region_pattern_autotuning_workflow, @@ -267,7 +268,7 @@ def _find_nodes_to_quantize_autotune( ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") - init_benchmark_instance( + benchmark_instance = init_benchmark_instance( use_trtexec=use_trtexec, plugin_libraries=trt_plugins, timing_cache_file=timing_cache_file, @@ -275,8 
+276,10 @@ def _find_nodes_to_quantize_autotune( timing_runs=timing_runs, trtexec_args=trtexec_args.split() if trtexec_args else None, ) - precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} + if benchmark_instance is None: + raise RuntimeError("Failed to initialize TensorRT benchmark") + precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} autotuner = region_pattern_autotuning_workflow( onnx_model, output_dir=Path(output_dir) if output_dir else None, @@ -327,7 +330,7 @@ def quantize( opset: int | None = None, autotune: bool = False, autotune_output_dir: str | None = None, - autotune_num_schemes_per_region: int = 30, + autotune_num_schemes_per_region: int = MODE_PRESETS["default"]["schemes_per_region"], autotune_pattern_cache_file: str | None = None, autotune_state_file: str | None = None, autotune_qdq_baseline: str | None = None, @@ -335,8 +338,8 @@ def quantize( autotune_verbose: bool = False, autotune_use_trtexec: bool = False, autotune_timing_cache: str | None = None, - autotune_warmup_runs: int = 5, - autotune_timing_runs: int = 20, + autotune_warmup_runs: int = MODE_PRESETS["default"]["warmup_runs"], + autotune_timing_runs: int = MODE_PRESETS["default"]["timing_runs"], autotune_trtexec_args: str | None = None, **kwargs: Any, ) -> None: @@ -464,6 +467,32 @@ def quantize( autotune: If True, detect optimal Q/DQ node placements according to the TensorRT version and platform available. If False, use the default pattern-based quantization approach. + autotune_output_dir: + Output directory for autotune results (state file, logs). Default: temp directory. + autotune_num_schemes_per_region: + Number of Q/DQ schemes to test per region. + autotune_pattern_cache_file: + Path to pattern cache YAML for warm-start. + autotune_qdq_baseline: + Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start. + autotune_state_file: + State file path for crash recovery and resume capability (default: /autotuner_state.yaml). 
+ autotune_node_filter_list: + Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). Regions without + any matching nodes are skipped during autotuning. + autotune_verbose: + Enable verbose logging in the autotuner. + autotune_use_trtexec: + Use trtexec for benchmarking instead of the TensorRT Python API. + autotune_timing_cache: + TensorRT timing cache file for faster engine builds. + autotune_warmup_runs: + Number of warmup runs before timing. + autotune_timing_runs: + Number of timed runs for latency measurement. + autotune_trtexec_args: + Additional trtexec arguments as a single quoted string. + Example: --autotune_trtexec_args '--fp16 --workspace=4096' kwargs: Additional keyword arguments for int4 quantization, including: - awqlite_alpha_step (float): Alpha step for lite, range [0, 1]. diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py index 6e889e131..7dd8c8323 100644 --- a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -13,13 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from unittest.mock import patch import onnx import onnx_graphsurgeon as gs from _test_utils.import_helper import skip_if_no_tensorrt from _test_utils.onnx.lib_test_models import export_as_onnx -from _test_utils.onnx.quantization.autotune.models import _create_simple_resnet18_onnx_model +from _test_utils.onnx.quantization.autotune.models import _create_simple_resnet18_model from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, @@ -53,8 +54,8 @@ def test_autotune_quantization_integration(tmp_path): - the autotuner's own export (via export_onnx), and - the quantize(autotune=True) output model. 
""" - model_torch, input_tensor = _create_simple_resnet18_onnx_model() - onnx_path = tmp_path / "model.onnx" + model_torch, input_tensor = _create_simple_resnet18_model() + onnx_path = os.path.join(tmp_path, "model.onnx") output_path = onnx_path.replace(".onnx", ".quant.onnx") # Export torch model to ONNX From 8634b74f9487e82adf1f97f917a155d75d4d7615 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 15:28:31 -0400 Subject: [PATCH 38/42] Address additional coderabbit and copilot issues Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 889493ee3..7bc6000b5 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -46,11 +46,6 @@ from modelopt.onnx.logging_config import configure_logging, logger from modelopt.onnx.op_types import is_data_dependent_shape_op -from modelopt.onnx.quantization.autotune import MODE_PRESETS -from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, -) from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -254,7 +249,7 @@ def _find_nodes_to_quantize_autotune( trt_plugins: list[str] | None, high_precision_dtype: str = "fp16", output_dir: str | None = None, - num_schemes_per_region: int = 30, + num_schemes_per_region: int = 50, pattern_cache_file: str | None = None, state_file: str | None = None, qdq_baseline_model: str | None = None, @@ -262,12 +257,17 @@ def _find_nodes_to_quantize_autotune( verbose: bool = False, use_trtexec: bool = False, timing_cache_file: str | None = None, - warmup_runs: int = 5, - timing_runs: int = 20, + warmup_runs: int = 50, + timing_runs: int = 100, trtexec_args: str | 
None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: - logger.info("Running Auto Q/DQ with TensorRT") + # Import Autotune dependencies here to avoid making 'tensorrt' and 'cuda' a module-level requirement. + from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, + ) + logger.info("Running Auto Q/DQ with TensorRT") benchmark_instance = init_benchmark_instance( use_trtexec=use_trtexec, plugin_libraries=trt_plugins, @@ -330,7 +330,7 @@ def quantize( opset: int | None = None, autotune: bool = False, autotune_output_dir: str | None = None, - autotune_num_schemes_per_region: int = MODE_PRESETS["default"]["schemes_per_region"], + autotune_num_schemes_per_region: int = 50, autotune_pattern_cache_file: str | None = None, autotune_state_file: str | None = None, autotune_qdq_baseline: str | None = None, @@ -338,8 +338,8 @@ def quantize( autotune_verbose: bool = False, autotune_use_trtexec: bool = False, autotune_timing_cache: str | None = None, - autotune_warmup_runs: int = MODE_PRESETS["default"]["warmup_runs"], - autotune_timing_runs: int = MODE_PRESETS["default"]["timing_runs"], + autotune_warmup_runs: int = 50, + autotune_timing_runs: int = 100, autotune_trtexec_args: str | None = None, **kwargs: Any, ) -> None: From 0d82f64f2cb1292d8032879d06a91ea568d403f7 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:09:32 -0400 Subject: [PATCH 39/42] Added real scales test in the integration workflow Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 15 ++++--- .../test_autotune_quantization_integration.py | 44 ++++++++++++++++--- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 7bc6000b5..0b50aff5f 100755 --- a/modelopt/onnx/quantization/quantize.py +++ 
b/modelopt/onnx/quantization/quantize.py @@ -46,6 +46,14 @@ from modelopt.onnx.logging_config import configure_logging, logger from modelopt.onnx.op_types import is_data_dependent_shape_op + +try: + from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, + ) +except ImportError: + logger.warning("Failed to import Autotune dependencies") from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -261,13 +269,8 @@ def _find_nodes_to_quantize_autotune( timing_runs: int = 100, trtexec_args: str | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: - # Import Autotune dependencies here to avoid making 'tensorrt' and 'cuda' a module-level requirement. - from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, - ) - logger.info("Running Auto Q/DQ with TensorRT") + benchmark_instance = init_benchmark_instance( use_trtexec=use_trtexec, plugin_libraries=trt_plugins, diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py index 7dd8c8323..829eebb55 100644 --- a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -42,15 +42,33 @@ def _quantized_tensor_indices(model: onnx.ModelProto) -> set[tuple[str, int]]: } +def _collect_q_scales(model: onnx.ModelProto) -> dict[str, float]: + """Return {scale_initializer_name: float_value} for every QuantizeLinear node. + + Works for both float32 and float16 scale initializers (the latter produced by + the fp16-conversion pass that runs after ORT calibration). 
+ """ + initializers = {init.name: init for init in model.graph.initializer} + scales = {} + for node in model.graph.node: + if node.op_type == "QuantizeLinear" and len(node.input) >= 2: + scale_name = node.input[1] + if scale_name in initializers: + raw = onnx.numpy_helper.to_array(initializers[scale_name]) + scales[scale_name] = float(raw.flat[0]) + return scales + + def test_autotune_quantization_integration(tmp_path): """Ensure that the quantized tensors are the same for standalone Autotune and MOQ with Autotune. - Runs the autotuner once to obtain a fixed set of insertion points. The same - autotuner instance is then injected into quantize() via patching so that both - sides reflect identical placement decisions without a second TRT profiling run. + Also ensure that the scales in the Q/DQ nodes have been updated from standalone Autotune to MOQ with Autotune. - Compares the set of (node_name, input_index) pairs where a DQ node feeds the - input between: + Runs the autotuner once to obtain a fixed set of insertion points. The same autotuner instance is then injected + into quantize() via patching so that both sides reflect identical placement decisions without a second TRT + profiling run. + + Compares the set of (node_name, input_index) pairs where a DQ node feeds the input between: - the autotuner's own export (via export_onnx), and - the quantize(autotune=True) output model. 
""" @@ -93,5 +111,21 @@ def test_autotune_quantization_integration(tmp_path): ): quantize(onnx_path, autotune=True, output_path=output_path) + # Check Q/DQ nodes placement moq_tensors = _quantized_tensor_indices(onnx.load(output_path)) assert autotune_tensors == moq_tensors + + # Check Q/DQ scales + scales_random = _collect_q_scales(autotune_model) + scales_calib = _collect_q_scales(onnx.load(output_path)) + assert scales_random, "Expected at least one Q scale in the standalone Autotune model" + assert scales_calib, "Expected at least one Q scale in the MOQ + Autotune integrated model" + assert len(scales_random.keys()) == len(scales_calib.keys()), ( + "Both models must quantize the same number of tensor" + ) + assert all( + v != list(scales_calib.values())[idx] for idx, v in enumerate(scales_random.values()) + ), ( + "All or some Q/DQ scales are identical between the standalone Autotune and MOQ + Autotune integrated models. " + "The integrated quantization appears to have had no effect on scale computation." 
+ ) From ee873304a636b93ca46f72eec26d42ec21527589 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:51:15 -0400 Subject: [PATCH 40/42] Address additional copilot issues: includes fix for op_types_to_quantize overwrite and other flags (should have the same behavior as pre-autotune) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 36 +++++++++---------- .../autotune/test_pattern_cache.py | 1 + .../onnx/quantization/autotune/test_region.py | 6 ---- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 0b50aff5f..bbc54a4c7 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -584,22 +584,21 @@ def quantize( # Check op types spelling in 'op_types_to_exclude' and '_to_quantize' validate_op_types_spelling(onnx_path, op_types_to_quantize, op_types_to_exclude) - if not autotune: - # (1) If disable_mha_qdq is set, don't add Q/DQ layers to MatMuls in MHA pattern. - # (2) else when quantize_mode == "int8", if seq_len > 512, don't add Q/DQ layers to - # MatMuls in MHA pattern. - # (3) else when quantize_mode == "fp8", if head_size > 256 or head_size <= 8 - # or mha doesn't meet fp8 fMHA v2 pattern, don't add Q/DQ layers to MatMuls in MHA pattern. - nodes_to_exclude = find_nodes_from_mha_to_exclude( - onnx_path, - use_external_data_format, - nodes_to_exclude, - disable_mha_qdq, - quantize_mode, - intermediate_generated_files, - calibration_data_reader, - calibration_eps, - ) + # (1) If disable_mha_qdq is set, don't add Q/DQ layers to MatMuls in MHA pattern. + # (2) else when quantize_mode == "int8", if seq_len > 512, don't add Q/DQ layers to + # MatMuls in MHA pattern. 
+ # (3) else when quantize_mode == "fp8", if head_size > 256 or head_size <= 8 + # or mha doesn't meet fp8 fMHA v2 pattern, don't add Q/DQ layers to MatMuls in MHA pattern. + nodes_to_exclude = find_nodes_from_mha_to_exclude( + onnx_path, + use_external_data_format, + nodes_to_exclude, + disable_mha_qdq, + quantize_mode, + intermediate_generated_files, + calibration_data_reader, + calibration_eps, + ) if calibrate_per_node and not calibration_shapes: calibration_shapes = get_input_shapes(onnx_path) @@ -608,7 +607,7 @@ def quantize( if autotune: ( nodes_to_quantize_autotune, - op_types_to_quantize, + op_types_to_quantize_autotune, no_quantize_inputs, op_types_needing_output_quant, ) = _find_nodes_to_quantize_autotune( @@ -629,7 +628,8 @@ def quantize( timing_runs=autotune_timing_runs, trtexec_args=autotune_trtexec_args, ) - nodes_to_quantize.extend(nodes_to_quantize_autotune) + op_types_to_quantize = op_types_to_quantize or op_types_to_quantize_autotune + nodes_to_quantize = nodes_to_quantize or nodes_to_quantize_autotune kwargs["no_quantize_inputs"] = no_quantize_inputs kwargs["op_types_needing_output_quant"] = op_types_needing_output_quant diff --git a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py index 294501ff0..a2d61c507 100644 --- a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py +++ b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + """ Tests for PatternCache in the autotuner. 
diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index 5a733017d..34e2cd244 100644 --- a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -13,12 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """Tests for the Region class in the autotuner.""" import pytest From 1a531b95e39b35033ee1004af9a99f6571179bd2 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:59:04 -0400 Subject: [PATCH 41/42] nit: added docstring and comment Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/fp8.py | 2 ++ modelopt/onnx/quantization/int8.py | 2 ++ modelopt/onnx/quantization/quantize.py | 1 + 3 files changed, 5 insertions(+) diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py index e181e1864..b7146173a 100755 --- a/modelopt/onnx/quantization/fp8.py +++ b/modelopt/onnx/quantization/fp8.py @@ -220,6 +220,8 @@ def quantize( # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. + # Note that this check will be skipped if Autotune is enabled as Q/DQ node placements + # will be decided according to TensorRT's runtime measurements. 
logger.info("Detecting GEMV patterns for TRT optimization") matmul_nodes_to_exclude = find_nodes_from_matmul_to_exclude( onnx_path, diff --git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py index 27c87abd4..ad2ca9558 100755 --- a/modelopt/onnx/quantization/int8.py +++ b/modelopt/onnx/quantization/int8.py @@ -162,6 +162,8 @@ def quantize( # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. + # Note that this check will be skipped if Autotune is enabled as Q/DQ node placements + # will be decided according to TensorRT's runtime measurements. logger.info("Detecting GEMV patterns for TRT optimization") matmul_nodes_to_exclude = find_nodes_from_matmul_to_exclude( onnx_path, diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index bbc54a4c7..b53904657 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -269,6 +269,7 @@ def _find_nodes_to_quantize_autotune( timing_runs: int = 100, trtexec_args: str | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: + """Extracts quantization information from Autotune to provide ORT quantization.""" logger.info("Running Auto Q/DQ with TensorRT") benchmark_instance = init_benchmark_instance( From ede8df0118cdfc1c5d9df19bbc22f59ca50f94fe Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:15:13 -0400 Subject: [PATCH 42/42] Created autotune utils Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/__init__.py | 5 +- .../onnx/quantization/autotune/__main__.py | 64 ++------------- modelopt/onnx/quantization/autotune/utils.py | 81 +++++++++++++++++++ 3 files changed, 89 insertions(+), 61 deletions(-) create mode 100644 
modelopt/onnx/quantization/autotune/utils.py diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index b00e8c8f9..74f44f972 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -20,8 +20,8 @@ region analysis to efficiently explore and optimize Q/DQ insertion strategies. """ -# Expose Autotune modes and args -from .__main__ import MODE_PRESETS, StoreWithExplicitFlag, get_node_filter_list +# Expose Autotune modes +from .__main__ import MODE_PRESETS # Core data structures from .autotuner import QDQAutotuner @@ -45,6 +45,7 @@ ) from .region_pattern import RegionPattern from .region_search import CombinedRegionSearch +from .utils import StoreWithExplicitFlag, get_node_filter_list __all__ = [ "MODE_PRESETS", diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index a66585300..071ba6ceb 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -21,6 +21,11 @@ from pathlib import Path from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.utils import ( + StoreWithExplicitFlag, + get_node_filter_list, + validate_file_path, +) from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, region_pattern_autotuning_workflow, @@ -44,20 +49,6 @@ } -class StoreWithExplicitFlag(argparse.Action): - """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" - - def __init__(self, explicit_attr: str, *args, **kwargs): - """Initialize explicit attribute flag.""" - self._explicit_attr = explicit_attr - super().__init__(*args, **kwargs) - - def __call__(self, parser, namespace, values, option_string=None): - """Set attributes.""" - setattr(namespace, self.dest, values) - setattr(namespace, self._explicit_attr, True) - - def apply_mode_presets(args) -> 
None: """Apply --mode preset to schemes_per_region, warmup_runs, timing_runs. @@ -75,30 +66,6 @@ def apply_mode_presets(args) -> None: args.timing_runs = preset["timing_runs"] -def validate_file_path(path: str | None, description: str) -> Path | None: - """Validate that a file path exists. - - Args: - path: Path string to validate (can be None) - description: Description of the file for error messages - - Returns: - Path object if valid, None if path is None - - Raises: - SystemExit: If path is provided but doesn't exist - """ - if path is None: - return None - - path_obj = Path(path) - if not path_obj.exists(): - logger.error(f"{description} not found: {path_obj}") - sys.exit(1) - - return path_obj - - def log_benchmark_config(args): """Log TensorRT benchmark configuration for transparency. @@ -118,27 +85,6 @@ def log_benchmark_config(args): logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") -def get_node_filter_list(node_filter_list_path: str) -> list | None: - """Extract node filter list from node filters path. - - Args: - node_filter_list_path: Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). - - Returns: - Node filter list - """ - node_filter_list = None - if node_filter_list_path: - filter_file = validate_file_path(node_filter_list_path, "Node filter list file") - if filter_file: - with open(filter_file) as f: - node_filter_list = [ - line.strip() for line in f if line.strip() and not line.strip().startswith("#") - ] - logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") - return node_filter_list - - def run_autotune() -> int: """Execute the complete pattern-based Q/DQ autotuning workflow. 
diff --git a/modelopt/onnx/quantization/autotune/utils.py b/modelopt/onnx/quantization/autotune/utils.py new file mode 100644 index 000000000..8760b4bc1 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/utils.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions related to Autotune.""" + +import argparse +import sys +from pathlib import Path + +from modelopt.onnx.logging_config import logger + + +class StoreWithExplicitFlag(argparse.Action): + """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" + + def __init__(self, explicit_attr: str, *args, **kwargs): + """Initialize explicit attribute flag.""" + self._explicit_attr = explicit_attr + super().__init__(*args, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + """Set attributes.""" + setattr(namespace, self.dest, values) + setattr(namespace, self._explicit_attr, True) + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. 
+ + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def get_node_filter_list(node_filter_list_path: str) -> list | None: + """Extract node filter list from node filters path. + + Args: + node_filter_list_path: Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). + + Returns: + Node filter list + """ + node_filter_list = None + if node_filter_list_path: + filter_file = validate_file_path(node_filter_list_path, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() for line in f if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + return node_filter_list