diff --git a/modelopt/onnx/op_types.py b/modelopt/onnx/op_types.py index 7e11d25e6..42085e18f 100644 --- a/modelopt/onnx/op_types.py +++ b/modelopt/onnx/op_types.py @@ -386,3 +386,25 @@ def get_symmetric_ops(): "BitwiseOr", "BitwiseXor", } + + +def get_activation_ops(): + """Returns set of activation operations.""" + return { + "Relu", + "LeakyRelu", + "PRelu", + "Elu", + "Selu", + "ThresholdedRelu", + "Sigmoid", + "Tanh", + "HardSigmoid", + "Softmax", + "LogSoftmax", + "Clip", + "Softplus", + "Softsign", + "Swish", + "HardSwish", + } diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index 6c79d9317..8a71291f1 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -20,6 +20,11 @@ import numpy as np +from modelopt.onnx.quantization.autotune import ( + MODE_PRESETS, + StoreWithExplicitFlag, + get_node_filter_list, +) from modelopt.onnx.quantization.quantize import quantize __all__ = ["main"] @@ -295,9 +300,128 @@ def get_parser() -> argparse.ArgumentParser: "if certain operations require a higher version." ), ) + argparser.add_argument( + "--autotune", + nargs="?", + const="default", + default=None, + choices=["quick", "default", "extensive"], + help=( + "If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. " + "Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): " + " - 'quick': fewer schemes and benchmark runs for quick exploration; " + " - 'default': balanced, recommended for most cases; " + " - 'extensive': more schemes and runs for extensive search and thorough tuning. " + "Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset." 
+ ), + ) + + autotune_group = argparser.add_argument_group( + "Autotune (only applicable when --autotune is set)" + ) + autotune_group.add_argument( + "--autotune_output_dir", + type=str, + default=None, + help="Output directory for autotune results (state file, logs). Default: temp directory.", + ) + autotune_group.add_argument( + "--autotune_schemes_per_region", + type=int, + default=MODE_PRESETS["default"]["schemes_per_region"], + help="Number of Q/DQ schemes to test per region.", + action=StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_schemes_per_region", + ) + autotune_group.add_argument( + "--autotune_pattern_cache", + type=str, + default=None, + dest="autotune_pattern_cache_file", + help="Path to pattern cache YAML for warm-start.", + ) + autotune_group.add_argument( + "--autotune_qdq_baseline", + type=str, + default=None, + help="Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start.", + ) + autotune_group.add_argument( + "--autotune_state_file", + type=str, + default=None, + help="State file path for crash recovery and resume capability (default: /autotuner_state.yaml).", + ) + autotune_group.add_argument( + "--autotune_node_filter_list", + type=str, + default=None, + help=( + "Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " + "Regions without any matching nodes are skipped during autotuning." 
+ ), + ) + autotune_group.add_argument( + "--autotune_verbose", + action="store_true", + help="Enable verbose logging in the autotuner.", + ) + autotune_group.add_argument( + "--autotune_use_trtexec", + action="store_true", + help="Use trtexec for benchmarking instead of the TensorRT Python API.", + ) + autotune_group.add_argument( + "--autotune_timing_cache", + type=str, + default=None, + help="TensorRT timing cache file for faster engine builds.", + ) + autotune_group.add_argument( + "--autotune_warmup_runs", + type=int, + default=MODE_PRESETS["default"]["warmup_runs"], + help="Number of warmup runs before timing.", + action=StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_warmup_runs", + ) + autotune_group.add_argument( + "--autotune_timing_runs", + type=int, + default=MODE_PRESETS["default"]["timing_runs"], + help="Number of timed runs for latency measurement.", + action=StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_timing_runs", + ) + autotune_group.add_argument( + "--autotune_trtexec_args", + type=str, + default=None, + help=( + "Additional trtexec arguments as a single quoted string. " + "Example: --autotune_trtexec_args '--fp16 --workspace=4096'" + ), + ) return argparser +def apply_mode_presets(args) -> None: + """Apply --autotune=mode preset to schemes_per_region, warmup_runs, timing_runs. + + Only applies preset for an option when that option was not explicitly set on the + command line (explicit flags override the preset). 
+ """ + if args.autotune not in MODE_PRESETS: + return + preset = MODE_PRESETS[args.autotune] + if not getattr(args, "_explicit_autotune_schemes_per_region", False): + args.autotune_schemes_per_region = preset["schemes_per_region"] + if not getattr(args, "_explicit_autotune_warmup_runs", False): + args.autotune_warmup_runs = preset["warmup_runs"] + if not getattr(args, "_explicit_autotune_timing_runs", False): + args.autotune_timing_runs = preset["timing_runs"] + + def main(): """Command-line entrypoint for ONNX PTQ.""" args = get_parser().parse_args() @@ -331,6 +455,14 @@ def main(): else: raise + # Autotune configs + autotune_enabled = args.autotune is not None + if autotune_enabled: + apply_mode_presets(args) + autotune_node_filter_list = ( + get_node_filter_list(args.autotune_node_filter_list) if autotune_enabled else None + ) + quantize( args.onnx_path, quantize_mode=args.quantize_mode, @@ -362,6 +494,19 @@ def main(): calibrate_per_node=args.calibrate_per_node, direct_io_types=args.direct_io_types, opset=args.opset, + autotune=autotune_enabled, + autotune_output_dir=args.autotune_output_dir, + autotune_num_schemes_per_region=args.autotune_schemes_per_region, + autotune_pattern_cache_file=args.autotune_pattern_cache_file, + autotune_state_file=args.autotune_state_file, + autotune_qdq_baseline=args.autotune_qdq_baseline, + autotune_node_filter_list=autotune_node_filter_list, + autotune_verbose=args.autotune_verbose, + autotune_use_trtexec=args.autotune_use_trtexec, + autotune_timing_cache=args.autotune_timing_cache, + autotune_warmup_runs=args.autotune_warmup_runs, + autotune_timing_runs=args.autotune_timing_runs, + autotune_trtexec_args=args.autotune_trtexec_args, ) diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index 7f14bb360..74f44f972 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -20,6 +20,9 @@ region analysis to 
efficiently explore and optimize Q/DQ insertion strategies. """ +# Expose Autotune modes +from .__main__ import MODE_PRESETS + # Core data structures from .autotuner import QDQAutotuner from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark @@ -42,8 +45,10 @@ ) from .region_pattern import RegionPattern from .region_search import CombinedRegionSearch +from .utils import StoreWithExplicitFlag, get_node_filter_list __all__ = [ + "MODE_PRESETS", "AutotunerError", "AutotunerNotInitializedError", "ChildRegionInputInsertionPoint", @@ -60,6 +65,8 @@ "RegionPattern", "RegionType", "ResolvedInsertionPoint", + "StoreWithExplicitFlag", "TensorRTPyBenchmark", "TrtExecBenchmark", + "get_node_filter_list", ] diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index cb7b3c281..071ba6ceb 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -21,6 +21,11 @@ from pathlib import Path from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.utils import ( + StoreWithExplicitFlag, + get_node_filter_list, + validate_file_path, +) from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, region_pattern_autotuning_workflow, @@ -44,18 +49,6 @@ } -class _StoreWithExplicitFlag(argparse.Action): - """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" - - def __init__(self, explicit_attr: str, *args, **kwargs): - self._explicit_attr = explicit_attr - super().__init__(*args, **kwargs) - - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, values) - setattr(namespace, self._explicit_attr, True) - - def apply_mode_presets(args) -> None: """Apply --mode preset to schemes_per_region, warmup_runs, timing_runs. 
@@ -73,30 +66,6 @@ def apply_mode_presets(args) -> None: args.timing_runs = preset["timing_runs"] -def validate_file_path(path: str | None, description: str) -> Path | None: - """Validate that a file path exists. - - Args: - path: Path string to validate (can be None) - description: Description of the file for error messages - - Returns: - Path object if valid, None if path is None - - Raises: - SystemExit: If path is provided but doesn't exist - """ - if path is None: - return None - - path_obj = Path(path) - if not path_obj.exists(): - logger.error(f"{description} not found: {path_obj}") - sys.exit(1) - - return path_obj - - def log_benchmark_config(args): """Log TensorRT benchmark configuration for transparency. @@ -155,20 +124,9 @@ def run_autotune() -> int: return 1 try: - node_filter_list = None - if args.node_filter_list: - filter_file = validate_file_path(args.node_filter_list, "Node filter list file") - if filter_file: - with open(filter_file) as f: - node_filter_list = [ - line.strip() - for line in f - if line.strip() and not line.strip().startswith("#") - ] - logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") - + node_filter_list = get_node_filter_list(args.node_filter_list) region_pattern_autotuning_workflow( - model_path=str(model_path), + model_or_path=str(model_path), output_dir=output_dir, num_schemes_per_region=args.num_schemes, pattern_cache_file=args.pattern_cache_file, @@ -262,7 +220,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: type=int, default=DEFAULT_NUM_SCHEMES, dest="num_schemes", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_num_schemes", help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)", ) @@ -328,7 +286,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--warmup_runs", type=int, default=DEFAULT_WARMUP_RUNS, - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, 
explicit_attr="_explicit_warmup_runs", help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)", ) @@ -336,7 +294,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--timing_runs", type=int, default=DEFAULT_TIMING_RUNS, - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_timing_runs", help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)", ) diff --git a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index a519d7c61..6df297e95 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -35,7 +35,7 @@ import yaml from modelopt.onnx.logging_config import logger -from modelopt.onnx.op_types import is_linear_op +from modelopt.onnx.op_types import get_activation_ops, is_linear_op from modelopt.onnx.quantization.autotune.common import ( AutotunerNotInitializedError, Config, @@ -46,7 +46,10 @@ Region, ) from modelopt.onnx.quantization.autotune.export_utils import export_qdq_onnx -from modelopt.onnx.quantization.autotune.insertion_points import ResolvedInsertionPoint +from modelopt.onnx.quantization.autotune.insertion_points import ( + ResolvedInsertionPoint, + get_autotuner_quantizable_ops, +) from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern from modelopt.onnx.quantization.graph_utils import get_tensor_consumer_node_indices @@ -434,6 +437,125 @@ def _exclude_overlapping_insertion_points( if all_region_ips: logger.debug(f" → Excluded {len(all_region_ips)} overlapping insertion points") + @_requires_init + def get_resolved_insertion_points( + self, best: bool = True, verbose: bool = False + ) -> set[ResolvedInsertionPoint]: + """Compute Q/DQ insertion points for the best schemes (assuming best=True). + + Args: + best: If True, use the best scheme for each region. 
If False, use the current scheme. + verbose: If True, log matched-region counts and per-region insertion point details. + + Returns: + Set of ResolvedInsertionPoint objects representing where Q/DQ pairs should be inserted. + + Raises: + AutotunerNotInitializedError: If initialize() hasn't been called + """ + resolved_insertion_points: set[ResolvedInsertionPoint] = set() + matched_regions = 0 + + if verbose: + logger.debug(f"Resolving Q/DQ insertion points from {len(self.regions)} regions") + + for region in self.regions: + current_scheme, pattern = self._resolve_scheme_for_region(region, best) + if current_scheme is None: + continue + self._exclude_overlapping_insertion_points(resolved_insertion_points, region, pattern) + new_insertion_points = pattern.matches(region, self.graph, current_scheme) + if new_insertion_points: + resolved_insertion_points.update(new_insertion_points) + matched_regions += 1 + if verbose: + logger.debug(f" → Added {len(new_insertion_points)} insertion points") + if verbose: + logger.debug( + f"Matched {matched_regions}/{len(self.regions)} regions, " + f"total {len(resolved_insertion_points)} unique insertion points" + ) + return resolved_insertion_points + + @_requires_init + def get_ort_quantization_config( + self, + ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: + """Derive ORT quantization configuration from resolved insertion points. + + Returns the four parameters consumed by INT8 and FP8 quantize() to replicate the autotuner's + Q/DQ placement decisions without exporting any intermediate ONNX file to disk. + + Returns: + nodes_to_quantize: Node names that have at least one covered Q/DQ input. + op_types_to_quantize: Op types eligible for quantization. + no_quantize_inputs: List of (src_node, dst_node, tensor_name) tuples for inputs + of quantized nodes that should NOT receive Q/DQ. 
+ op_types_needing_output_quant: Producer op types whose output feeds a covered + activation-op input (needed so ORT inserts Q/DQ between e.g. Add and Relu). + + Raises: + AutotunerNotInitializedError: If initialize() hasn't been called. + """ + resolved_ips = self.get_resolved_insertion_points(best=True) + graph = self.graph + + # Build (node_index, input_index) pairs that have Q/DQ + covered: set[tuple[int, int]] = set() + for ip in resolved_ips: + if ip.node_index is not None and ip.input_index is not None: + covered.add((ip.node_index, ip.input_index)) + else: + # Tensor-level insertion point: expand to all consumer (node, input) pairs + for consumer_idx in graph.tensor_users_map.get(ip.tensor_name, []): + node = graph.nodes[consumer_idx] + for inp_idx, inp in enumerate(node.inputs): + if getattr(inp, "name", None) == ip.tensor_name: + covered.add((consumer_idx, inp_idx)) + + # Nodes that consume a covered (DQ-fed) input + quantized_node_indices: set[int] = {node_idx for node_idx, _ in covered} + + # Also include producer nodes of covered inputs: a producer whose output feeds a + # covered slot needs to be in nodes_to_quantize so ORT can place Q on its output + # (e.g., Add must be included when Q/DQ sits between Add and Relu). 
+ node_name_to_idx = {node.name: i for i, node in enumerate(graph.nodes)} + for node_idx, inp_idx in covered: + tensor = graph.nodes[node_idx].inputs[inp_idx] + if tensor.inputs: + producer_idx = node_name_to_idx.get(tensor.inputs[0].name) + if producer_idx is not None: + quantized_node_indices.add(producer_idx) + + nodes_to_quantize = [graph.nodes[i].name for i in quantized_node_indices] + op_types_to_quantize = list(get_autotuner_quantizable_ops()) + + # Inputs of quantized nodes NOT covered by Q/DQ (only non-constant producer inputs) + no_quantize_inputs: list[tuple[gs.Node, gs.Node, str]] = [] + for node_idx in quantized_node_indices: + node = graph.nodes[node_idx] + for inp_idx, inp in enumerate(node.inputs): + if (node_idx, inp_idx) not in covered and getattr(inp, "name", None): + if inp.inputs: + no_quantize_inputs.append((inp.inputs[0], node, inp.name)) + + # Producer op types whose output feeds a covered activation-op input + # (e.g., to support Add->Q/DQ->Relu patterns) + op_types_needing_output_quant: set[str] = set() + for node_idx, inp_idx in covered: + node = graph.nodes[node_idx] + if node.op in get_activation_ops(): + tensor = node.inputs[inp_idx] + if tensor.inputs: + op_types_needing_output_quant.add(tensor.inputs[0].op) + + return ( + nodes_to_quantize, + op_types_to_quantize, + no_quantize_inputs, + list(op_types_needing_output_quant), + ) + @_requires_init def export_onnx( self, output_path: str | None = None, insert_qdq: bool = True, best: bool = False @@ -469,29 +591,7 @@ def export_onnx( ) if insert_qdq: - matched_regions = 0 - - logger.debug(f"Resolving Q/DQ insertion points from {len(self.regions)} regions") - - for region in self.regions: - current_scheme, pattern = self._resolve_scheme_for_region(region, best) - if current_scheme is None: - continue - - self._exclude_overlapping_insertion_points( - resolved_insertion_points, region, pattern - ) - - new_ips = pattern.matches(region, self.graph, current_scheme) - if new_ips: - 
resolved_insertion_points.update(new_ips) - matched_regions += 1 - logger.debug(f" → Added {len(new_ips)} insertion points") - - logger.debug( - f"Matched {matched_regions}/{len(self.regions)} regions, " - f"total {len(resolved_insertion_points)} unique insertion points" - ) + resolved_insertion_points = self.get_resolved_insertion_points(best=best, verbose=True) unique_tensors = len(resolved_insertion_points) diff --git a/modelopt/onnx/quantization/autotune/utils.py b/modelopt/onnx/quantization/autotune/utils.py new file mode 100644 index 000000000..8760b4bc1 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/utils.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utility functions related to Autotune.""" + +import argparse +import sys +from pathlib import Path + +from modelopt.onnx.logging_config import logger + + +class StoreWithExplicitFlag(argparse.Action): + """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" + + def __init__(self, explicit_attr: str, *args, **kwargs): + """Initialize explicit attribute flag.""" + self._explicit_attr = explicit_attr + super().__init__(*args, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + """Set attributes.""" + setattr(namespace, self.dest, values) + setattr(namespace, self._explicit_attr, True) + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. + + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def get_node_filter_list(node_filter_list_path: str) -> list | None: + """Extract node filter list from node filters path. + + Args: + node_filter_list_path: Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). 
+ + Returns: + Node filter list + """ + node_filter_list = None + if node_filter_list_path: + filter_file = validate_file_path(node_filter_list_path, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() for line in f if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + return node_filter_list diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index 025d9fac4..190882a31 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -20,6 +20,8 @@ """ import fnmatch +import shutil +import tempfile from pathlib import Path import onnx @@ -158,8 +160,8 @@ def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool: def region_pattern_autotuning_workflow( - model_path: str, - output_dir: Path, + model_or_path: str | onnx.ModelProto, + output_dir: Path | None = None, num_schemes_per_region: int = 30, pattern_cache_file: str | None = None, state_file: str | None = None, @@ -195,8 +197,8 @@ def region_pattern_autotuning_workflow( 7. Export final optimized model with best Q/DQ scheme for each pattern Args: - model_path: Path to ONNX model file to optimize - output_dir: Directory for output files (state, logs, models). Created if doesn't exist. + model_or_path: Path to ONNX model file to optimize + output_dir: Directory for output files (state, logs, models). Created if it doesn't exist. num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern. 
Higher values explore more configurations but take longer (default: 30) pattern_cache_file: Optional path to pattern cache YAML file containing known-good schemes @@ -205,6 +207,7 @@ def region_pattern_autotuning_workflow( uses /autotuner_state.yaml (default: None) quant_type: Quantization data type - "int8" for INT8 quantization (default), "fp8" for FP8 quantization + default_dq_dtype: Dtype for DequantizeLinear output; "float32" (default) or "float16". qdq_baseline_model: Optional path to a pre-quantized ONNX model. If provided, extracts Q/DQ insertion patterns and adds them to pattern cache for warm-start (default: None) @@ -215,6 +218,10 @@ def region_pattern_autotuning_workflow( Returns: QDQAutotuner instance after autotuning """ + output_dir_is_temp = output_dir is None + if not output_dir: + output_dir = Path(tempfile.mkdtemp()) + output_dir.mkdir(parents=True, exist_ok=True) logs_dir = output_dir / "logs" logs_dir.mkdir(exist_ok=True) @@ -225,8 +232,11 @@ def region_pattern_autotuning_workflow( state_file = str(output_dir / "autotuner_state.yaml") state_path = Path(state_file) - logger.info(f"Loading model: {model_path}") - model = onnx.load(model_path) + if isinstance(model_or_path, str): + logger.info(f"Loading model: {model_or_path}") + model = onnx.load(model_or_path) + else: + model = model_or_path pattern_cache = None if pattern_cache_file: @@ -373,4 +383,9 @@ def region_pattern_autotuning_workflow( logger.debug(f" Logs: {logs_dir}") logger.debug(f" Region models: {models_dir}") + # Remove temporary folder + if output_dir_is_temp and output_dir.exists(): + shutil.rmtree(output_dir) + logger.info(f"Temporary directory {output_dir} was deleted!") + return autotuner diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py index 76a3e8167..b7146173a 100755 --- a/modelopt/onnx/quantization/fp8.py +++ b/modelopt/onnx/quantization/fp8.py @@ -183,6 +183,7 @@ def quantize( custom_ops_to_quantize: list[str] = [], direct_io_types: bool 
= False, opset: int | None = None, + autotune: bool = False, **kwargs, ) -> onnx.ModelProto: """Applies FP8 GEMM only quantization to an ONNX file. @@ -215,10 +216,12 @@ def quantize( op_types_to_quantize.extend(list(custom_ops_to_quantize)) enable_gemv_detection_for_trt = kwargs.get("enable_gemv_detection_for_trt", True) - if enable_gemv_detection_for_trt: + if enable_gemv_detection_for_trt and not autotune: # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. + # Note that this check will be skipped if Autotune is enabled as Q/DQ node placements + # will be decided according to TensorRT's runtime measurements. logger.info("Detecting GEMV patterns for TRT optimization") matmul_nodes_to_exclude = find_nodes_from_matmul_to_exclude( onnx_path, @@ -233,7 +236,8 @@ def quantize( # Collect node names to exclude from quantization nodes_to_exclude = find_nodes_to_exclude(graph, nodes_to_exclude, op_types_to_exclude) # type: ignore[arg-type] - nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")) + if not autotune: + nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")) # Change the default configuration of ORT quantization op_types = {node.op for node in graph.nodes} @@ -244,19 +248,22 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, + kwargs.get("op_types_needing_output_quant"), ) logger.info( f"Quantizable op types in the model: {[t for t in op_types_to_quantize if t in op_types]}" ) # Collect node names to include in quantization - no_quantize_inputs = [] - nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) - if not nodes_to_quantize: - quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( - graph, quantizable_op_types, nodes_to_exclude - ) - nodes_to_quantize = [node.name for node in quantizable_nodes] + 
nodes_to_quantize = nodes_to_quantize or [] + no_quantize_inputs = kwargs.get("no_quantize_inputs", []) + if not autotune: + nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) + if not nodes_to_quantize: + quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( + graph, quantizable_op_types, nodes_to_exclude + ) + nodes_to_quantize = [node.name for node in quantizable_nodes] # Update the list of nodes to quantize nodes_to_quantize = [ diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py index efa77dd7b..131723e61 100755 --- a/modelopt/onnx/quantization/graph_utils.py +++ b/modelopt/onnx/quantization/graph_utils.py @@ -616,16 +616,37 @@ def remove_partial_input_qdq( # Reached end of the graph continue if dq_node.op == "DequantizeLinear": - dq_node = dq_node.outputs[0] # source_node->Q->DQ->target_node0 + dq_output = dq_node.outputs[0] # source_node->Q->DQ->target_node + + # Look up the specific target node in the quantized graph. + # With DedicatedQDQPair=False, a shared Q/DQ pair may feed multiple consumers + # (e.g. Conv activation AND Add residual). Always patch the intended target + # rather than the first consumer of the DQ output to avoid removing Q/DQ from + # the wrong branch. + target_node_in_graph = graph_nodes.get(target.name) + if target_node_in_graph is None: + continue - # Find the input index in the target connecting with source_node + # Find the input index in the target that is connected to the DQ output target_input_idx_arr = [ - idx for idx, inp in enumerate(dq_node.outputs[0].inputs) if inp.name == dq_node.name + idx + for idx, inp in enumerate(target_node_in_graph.inputs) + if inp.name == dq_output.name ] - target_input_idx = target_input_idx_arr[0] if target_input_idx_arr else 0 + # If no input index is found (dq_output is not actually connected to target node), skip rewiring to + # prevent silent corruption of the graph. 
+ if not target_input_idx_arr: + logger.warning( + "Expected DequantizeLinear output '%s' to be an input of node '%s', " + "but no matching input was found. Skipping Q/DQ bypass for this edge.", + dq_output.name, + target_node_in_graph.name, + ) + continue + target_input_idx = target_input_idx_arr[0] - # Connect the output of source_node with the output of DQ - dq_node.outputs[0].inputs[target_input_idx] = source_node.outputs[0] + # Connect the target's input directly to source_node's output (bypass Q/DQ) + target_node_in_graph.inputs[target_input_idx] = source_node.outputs[0] # Check for quantized residual Adds where the parallel branch is not being quantized for source, target, non_qdq_input_name in no_quantize_inputs: diff --git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py index 6e350a16f..ad2ca9558 100755 --- a/modelopt/onnx/quantization/int8.py +++ b/modelopt/onnx/quantization/int8.py @@ -133,6 +133,7 @@ def quantize( custom_ops_to_quantize: list[str] = [], direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, **kwargs, ) -> onnx.ModelProto: """Applies INT8 quantization to an ONNX file using the compiler friendly heuristics. @@ -157,10 +158,12 @@ def quantize( return onnx_model enable_gemv_detection_for_trt = kwargs.get("enable_gemv_detection_for_trt", True) - if enable_gemv_detection_for_trt: + if enable_gemv_detection_for_trt and not autotune: # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. + # Note that this check will be skipped if Autotune is enabled as Q/DQ node placements + # will be decided according to TensorRT's runtime measurements. 
logger.info("Detecting GEMV patterns for TRT optimization") matmul_nodes_to_exclude = find_nodes_from_matmul_to_exclude( onnx_path, @@ -175,7 +178,8 @@ def quantize( # Collect node names to exclude from quantization nodes_to_exclude = find_nodes_to_exclude(graph, nodes_to_exclude, op_types_to_exclude) # type: ignore[arg-type] - nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")) + if not autotune: + nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")) # Change the default configuration of ORT quantization op_types_to_quantize = op_types_to_quantize or [] @@ -189,22 +193,27 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, + kwargs.get("op_types_needing_output_quant"), ) logger.info(f"Quantizable op types: {[t for t in quantizable_op_types if t in op_types]}") # Collect node names to include in quantization - no_quantize_inputs = [] - nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) - if not nodes_to_quantize: - # If nodes_to_quantize is not passed, use user supplied op_types_to_quantize list - nodes_to_quantize = [node.name for node in graph.nodes if node.op in op_types_to_quantize] - - # If op_types_to_quantize is not provided, use default QDQ placement algorithm + nodes_to_quantize = nodes_to_quantize or [] + no_quantize_inputs = kwargs.get("no_quantize_inputs", []) + if not autotune: + nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) if not nodes_to_quantize: - quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( - graph, quantizable_op_types, nodes_to_exclude - ) - nodes_to_quantize = [node.name for node in quantizable_nodes] + # If nodes_to_quantize is not passed, use user supplied op_types_to_quantize list + nodes_to_quantize = [ + node.name for node in graph.nodes if node.op in op_types_to_quantize + ] + + # If op_types_to_quantize is not provided, use default QDQ placement algorithm + if not 
nodes_to_quantize: + quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( + graph, quantizable_op_types, nodes_to_exclude + ) + nodes_to_quantize = [node.name for node in quantizable_nodes] # Read the calibration cache and quantize nodes for which activation scale values are cached if calibration_cache_path: @@ -220,7 +229,8 @@ def quantize( logger.info( f"Skipping quantization of nodes: {set(nodes_to_quantize) - set(iq_quantized_nodes)}" ) - nodes_to_quantize = list(set(nodes_to_quantize).intersection(iq_quantized_nodes)) + if not autotune: + nodes_to_quantize = list(set(nodes_to_quantize).intersection(iq_quantized_nodes)) # Update the list of nodes to quantize nodes_to_quantize = [ diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 5c89e20d7..173fbb06d 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -271,6 +271,7 @@ def configure_ort( calibration_eps: list[str] | None = None, calibrate_per_node: bool = False, custom_ops_to_quantize: list[str] = [], + op_types_needing_output_quant: list[str] | None = None, ): """Configure and patches ORT to support ModelOpt ONNX quantization.""" logger.info("Configuring ORT for ModelOpt ONNX quantization") @@ -291,7 +292,7 @@ def configure_ort( # Remove copy, reduction and activation ops from ORT QDQ registry logger.debug("Removing non-quantizable ops from QDQ registry") - for op_type in [ + for op_type in { "ArgMax", "Concat", "EmbedLayerNormalization", @@ -311,7 +312,7 @@ def configure_ort( "Transpose", "Unsqueeze", "Where", - ]: + } - set(op_types_to_quantize): if op_type in QLinearOpsRegistry: del QLinearOpsRegistry[op_type] if op_type in QDQRegistry: @@ -319,7 +320,10 @@ def configure_ort( # Prepare TensorRT friendly quantization settings no_output_quantization_op_types = [ - op_type for op_type in op_types if op_type not in custom_ops_to_quantize + op_type + for op_type in op_types + if op_type not in 
custom_ops_to_quantize + and op_type not in (op_types_needing_output_quant or []) ] if trt_extra_plugin_lib_paths is not None: trt_extra_plugin_lib_paths = ";".join(trt_extra_plugin_lib_paths) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index da7ff126d..b53904657 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -36,6 +36,7 @@ import shutil import tempfile from collections.abc import Sequence +from pathlib import Path from typing import Any import onnx @@ -45,6 +46,14 @@ from modelopt.onnx.logging_config import configure_logging, logger from modelopt.onnx.op_types import is_data_dependent_shape_op + +try: + from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, + ) +except ImportError: + logger.warning("Failed to import Autotune dependencies") from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -242,6 +251,54 @@ def _preprocess_onnx( ) +def _find_nodes_to_quantize_autotune( + onnx_model: onnx.ModelProto, + quantize_mode: str, + trt_plugins: list[str] | None, + high_precision_dtype: str = "fp16", + output_dir: str | None = None, + num_schemes_per_region: int = 50, + pattern_cache_file: str | None = None, + state_file: str | None = None, + qdq_baseline_model: str | None = None, + node_filter_list: list[str] | None = None, + verbose: bool = False, + use_trtexec: bool = False, + timing_cache_file: str | None = None, + warmup_runs: int = 50, + timing_runs: int = 100, + trtexec_args: str | None = None, +) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: + """Extracts quantization information from Autotune to provide ORT quantization.""" + logger.info("Running Auto Q/DQ with TensorRT") + + benchmark_instance = init_benchmark_instance( + use_trtexec=use_trtexec, + plugin_libraries=trt_plugins, + 
timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + trtexec_args=trtexec_args.split() if trtexec_args else None, + ) + if benchmark_instance is None: + raise RuntimeError("Failed to initialize TensorRT benchmark") + + precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} + autotuner = region_pattern_autotuning_workflow( + onnx_model, + output_dir=Path(output_dir) if output_dir else None, + num_schemes_per_region=num_schemes_per_region, + pattern_cache_file=pattern_cache_file, + state_file=state_file, + quant_type=quantize_mode, + default_dq_dtype=precision_map[high_precision_dtype], + qdq_baseline_model=qdq_baseline_model, + node_filter_list=node_filter_list, + verbose=verbose, + ) + return autotuner.get_ort_quantization_config() + + def quantize( onnx_path: str, quantize_mode: str = "int8", @@ -275,6 +332,19 @@ def quantize( input_shapes_profile: Sequence[dict[str, str]] | None = None, direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, + autotune_output_dir: str | None = None, + autotune_num_schemes_per_region: int = 50, + autotune_pattern_cache_file: str | None = None, + autotune_state_file: str | None = None, + autotune_qdq_baseline: str | None = None, + autotune_node_filter_list: list[str] | None = None, + autotune_verbose: bool = False, + autotune_use_trtexec: bool = False, + autotune_timing_cache: str | None = None, + autotune_warmup_runs: int = 50, + autotune_timing_runs: int = 100, + autotune_trtexec_args: str | None = None, **kwargs: Any, ) -> None: """Quantizes the provided ONNX model. @@ -398,6 +468,35 @@ def quantize( Target ONNX opset version for the quantized model. If None, uses required minimum opset (19 for int8/fp8, 21 for int4, 23 for nvfp4). If the specified opset is lower than the required minimum, a warning will be issued and the opset will be upgraded to the required minimum. 
+ autotune: + If True, detect optimal Q/DQ node placements according to the TensorRT version and platform available. + If False, use the default pattern-based quantization approach. + autotune_output_dir: + Output directory for autotune results (state file, logs). Default: temp directory. + autotune_num_schemes_per_region: + Number of Q/DQ schemes to test per region. + autotune_pattern_cache_file: + Path to pattern cache YAML for warm-start. + autotune_qdq_baseline: + Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start. + autotune_state_file: + State file path for crash recovery and resume capability (default: <output_dir>/autotuner_state.yaml). + autotune_node_filter_list: + Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). Regions without + any matching nodes are skipped during autotuning. + autotune_verbose: + Enable verbose logging in the autotuner. + autotune_use_trtexec: + Use trtexec for benchmarking instead of the TensorRT Python API. + autotune_timing_cache: + TensorRT timing cache file for faster engine builds. + autotune_warmup_runs: + Number of warmup runs before timing. + autotune_timing_runs: + Number of timed runs for latency measurement. + autotune_trtexec_args: + Additional trtexec arguments as a single quoted string. + Example: --autotune_trtexec_args '--fp16 --workspace=4096' kwargs: Additional keyword arguments for int4 quantization, including: - awqlite_alpha_step (float): Alpha step for lite, range [0, 1]. 
@@ -506,6 +605,35 @@ def quantize( calibration_shapes = get_input_shapes(onnx_path) if quantize_mode in ["fp8", "int8"]: + if autotune: + ( + nodes_to_quantize_autotune, + op_types_to_quantize_autotune, + no_quantize_inputs, + op_types_needing_output_quant, + ) = _find_nodes_to_quantize_autotune( + onnx_model, + quantize_mode, + trt_plugins, + high_precision_dtype, + output_dir=autotune_output_dir, + num_schemes_per_region=autotune_num_schemes_per_region, + pattern_cache_file=autotune_pattern_cache_file, + state_file=autotune_state_file, + qdq_baseline_model=autotune_qdq_baseline, + node_filter_list=autotune_node_filter_list, + verbose=autotune_verbose, + use_trtexec=autotune_use_trtexec, + timing_cache_file=autotune_timing_cache, + warmup_runs=autotune_warmup_runs, + timing_runs=autotune_timing_runs, + trtexec_args=autotune_trtexec_args, + ) + op_types_to_quantize = op_types_to_quantize or op_types_to_quantize_autotune + nodes_to_quantize = nodes_to_quantize or nodes_to_quantize_autotune + kwargs["no_quantize_inputs"] = no_quantize_inputs + kwargs["op_types_needing_output_quant"] = op_types_needing_output_quant + quantize_func = quantize_int8 if quantize_mode == "int8" else quantize_fp8 onnx_model = quantize_func( onnx_path=onnx_path, @@ -531,8 +659,10 @@ def quantize( custom_ops_to_quantize=list(custom_ops_to_quantize.keys()), direct_io_types=direct_io_types, opset=opset, + autotune=autotune, **kwargs, ) + elif "int4" in quantize_mode: onnx_model = quantize_int4( onnx_path=onnx_path, diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index fc63f6690..84a8b4ab8 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -20,6 +20,8 @@ """ import onnx +import torch +import torch.nn as nn from onnx import helper @@ -52,3 +54,42 @@ def _create_simple_conv_onnx_model(): ], ) return helper.make_model(graph, 
producer_name="test") + + +def _create_simple_resnet18_model(): + """Build a ResNet-18 subgraph (stem + layer1) for MOQ + Autotuner integration tests. + + Architecture: + Conv(3→64, 7×7, stride=2) → ReLU → MaxPool(3×3, stride=2) + → BasicBlock(64→64) → BasicBlock(64→64) + + Input shape: [1, 3, 1024, 1024], output shape: [1, 64, 256, 256]. + """ + + class _BasicBlock(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(64, 64, 3, padding=1, bias=True) + self.act1 = nn.ReLU() + self.conv2 = nn.Conv2d(64, 64, 3, padding=1, bias=True) + self.act2 = nn.ReLU() + + def forward(self, x): + return self.act2(self.conv2(self.act1(self.conv1(x))) + x) + + class _Model(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=True) + self.act1 = nn.ReLU() + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.layer1 = nn.Sequential(_BasicBlock(), _BasicBlock()) + + def forward(self, x): + return self.layer1(self.maxpool(self.act1(self.conv1(x)))) + + torch.manual_seed(42) + model = _Model().eval() + input_tensor = torch.zeros(1, 3, 1024, 1024) + + return model, input_tensor diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py new file mode 100644 index 000000000..829eebb55 --- /dev/null +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from unittest.mock import patch + +import onnx +import onnx_graphsurgeon as gs +from _test_utils.import_helper import skip_if_no_tensorrt +from _test_utils.onnx.lib_test_models import export_as_onnx +from _test_utils.onnx.quantization.autotune.models import _create_simple_resnet18_model + +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) +from modelopt.onnx.quantization.quantize import _preprocess_onnx, quantize + +skip_if_no_tensorrt() + + +def _quantized_tensor_indices(model: onnx.ModelProto) -> set[tuple[str, int]]: + """Return (node_name, input_index) for every DQ-fed input slot in the model.""" + graph = gs.import_onnx(model) + return { + (node.name, inp_idx) + for node in graph.nodes + for inp_idx, inp in enumerate(node.inputs) + if inp.inputs and inp.inputs[0].op == "DequantizeLinear" + } + + +def _collect_q_scales(model: onnx.ModelProto) -> dict[str, float]: + """Return {scale_initializer_name: float_value} for every QuantizeLinear node. + + Works for both float32 and float16 scale initializers (the latter produced by + the fp16-conversion pass that runs after ORT calibration). 
+ """ + initializers = {init.name: init for init in model.graph.initializer} + scales = {} + for node in model.graph.node: + if node.op_type == "QuantizeLinear" and len(node.input) >= 2: + scale_name = node.input[1] + if scale_name in initializers: + raw = onnx.numpy_helper.to_array(initializers[scale_name]) + scales[scale_name] = float(raw.flat[0]) + return scales + + +def test_autotune_quantization_integration(tmp_path): + """Ensure that the quantized tensors are the same for standalone Autotune and MOQ with Autotune. + + Also ensure that the scales in the Q/DQ nodes have been updated from standalone Autotune to MOQ with Autotune. + + Runs the autotuner once to obtain a fixed set of insertion points. The same autotuner instance is then injected + into quantize() via patching so that both sides reflect identical placement decisions without a second TRT + profiling run. + + Compares the set of (node_name, input_index) pairs where a DQ node feeds the input between: + - the autotuner's own export (via export_onnx), and + - the quantize(autotune=True) output model. + """ + model_torch, input_tensor = _create_simple_resnet18_model() + onnx_path = os.path.join(tmp_path, "model.onnx") + output_path = onnx_path.replace(".onnx", ".quant.onnx") + + # Export torch model to ONNX + export_as_onnx(model_torch, input_tensor, onnx_filename=onnx_path) + + # Load and pre-process ONNX + onnx_path, onnx_model, *_ = _preprocess_onnx( + onnx_path, + use_external_data_format=False, + output_path=output_path, + enable_shared_constants_duplication=True, + trt_plugins=None, + trt_plugins_precision=None, + override_shapes=None, # type: ignore[arg-type] + quantize_mode="int8", + ) + + # Run autotune once to get a determined set of placement decisions. 
+ init_benchmark_instance(use_trtexec=False) + autotuner = region_pattern_autotuning_workflow( + onnx_model, + quant_type="int8", + default_dq_dtype="float16", + ) + + # Autotune path: export the Q/DQ model directly and collect quantized tensor slots. + autotune_model = onnx.load_from_string(autotuner.export_onnx(best=True)) + autotune_tensors = _quantized_tensor_indices(autotune_model) + + # MOQ + Autotune path: inject the same autotuner so placement decisions are identical, + # then run the full quantize() pipeline and collect quantized tensor slots. + with patch( + "modelopt.onnx.quantization.quantize.region_pattern_autotuning_workflow", + return_value=autotuner, + ): + quantize(onnx_path, autotune=True, output_path=output_path) + + # Check Q/DQ nodes placement + moq_tensors = _quantized_tensor_indices(onnx.load(output_path)) + assert autotune_tensors == moq_tensors + + # Check Q/DQ scales + scales_random = _collect_q_scales(autotune_model) + scales_calib = _collect_q_scales(onnx.load(output_path)) + assert scales_random, "Expected at least one Q scale in the standalone Autotune model" + assert scales_calib, "Expected at least one Q scale in the MOQ + Autotune integrated model" + assert len(scales_random.keys()) == len(scales_calib.keys()), ( + "Both models must quantize the same number of tensors" + ) + assert all( + v != list(scales_calib.values())[idx] for idx, v in enumerate(scales_random.values()) + ), ( + "All or some Q/DQ scales are identical between the standalone Autotune and MOQ + Autotune integrated models. " + "The integrated quantization appears to have had no effect on scale computation." 
+ ) diff --git a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py index 294501ff0..a2d61c507 100644 --- a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py +++ b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + """ Tests for PatternCache in the autotuner. diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index 5a733017d..34e2cd244 100644 --- a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -13,12 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """Tests for the Region class in the autotuner.""" import pytest