diff --git a/modelopt/onnx/op_types.py b/modelopt/onnx/op_types.py index 7e11d25e6..42085e18f 100644 --- a/modelopt/onnx/op_types.py +++ b/modelopt/onnx/op_types.py @@ -386,3 +386,25 @@ def get_symmetric_ops(): "BitwiseOr", "BitwiseXor", } + + +def get_activation_ops(): + """Returns set of activation operations.""" + return { + "Relu", + "LeakyRelu", + "PRelu", + "Elu", + "Selu", + "ThresholdedRelu", + "Sigmoid", + "Tanh", + "HardSigmoid", + "Softmax", + "LogSoftmax", + "Clip", + "Softplus", + "Softsign", + "Swish", + "HardSwish", + } diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index 6c79d9317..8a71291f1 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -20,6 +20,11 @@ import numpy as np +from modelopt.onnx.quantization.autotune import ( + MODE_PRESETS, + StoreWithExplicitFlag, + get_node_filter_list, +) from modelopt.onnx.quantization.quantize import quantize __all__ = ["main"] @@ -295,9 +300,128 @@ def get_parser() -> argparse.ArgumentParser: "if certain operations require a higher version." ), ) + argparser.add_argument( + "--autotune", + nargs="?", + const="default", + default=None, + choices=["quick", "default", "extensive"], + help=( + "If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. " + "Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): " + " - 'quick': fewer schemes and benchmark runs for quick exploration; " + " - 'default': balanced, recommended for most cases; " + " - 'extensive': more schemes and runs for extensive search and thorough tuning. " + "Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset." 
+ ), + ) + + autotune_group = argparser.add_argument_group( + "Autotune (only applicable when --autotune is set)" + ) + autotune_group.add_argument( + "--autotune_output_dir", + type=str, + default=None, + help="Output directory for autotune results (state file, logs). Default: temp directory.", + ) + autotune_group.add_argument( + "--autotune_schemes_per_region", + type=int, + default=MODE_PRESETS["default"]["schemes_per_region"], + help="Number of Q/DQ schemes to test per region.", + action=StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_schemes_per_region", + ) + autotune_group.add_argument( + "--autotune_pattern_cache", + type=str, + default=None, + dest="autotune_pattern_cache_file", + help="Path to pattern cache YAML for warm-start.", + ) + autotune_group.add_argument( + "--autotune_qdq_baseline", + type=str, + default=None, + help="Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start.", + ) + autotune_group.add_argument( + "--autotune_state_file", + type=str, + default=None, + help="State file path for crash recovery and resume capability (default: /autotuner_state.yaml).", + ) + autotune_group.add_argument( + "--autotune_node_filter_list", + type=str, + default=None, + help=( + "Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " + "Regions without any matching nodes are skipped during autotuning." 
+ ), + ) + autotune_group.add_argument( + "--autotune_verbose", + action="store_true", + help="Enable verbose logging in the autotuner.", + ) + autotune_group.add_argument( + "--autotune_use_trtexec", + action="store_true", + help="Use trtexec for benchmarking instead of the TensorRT Python API.", + ) + autotune_group.add_argument( + "--autotune_timing_cache", + type=str, + default=None, + help="TensorRT timing cache file for faster engine builds.", + ) + autotune_group.add_argument( + "--autotune_warmup_runs", + type=int, + default=MODE_PRESETS["default"]["warmup_runs"], + help="Number of warmup runs before timing.", + action=StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_warmup_runs", + ) + autotune_group.add_argument( + "--autotune_timing_runs", + type=int, + default=MODE_PRESETS["default"]["timing_runs"], + help="Number of timed runs for latency measurement.", + action=StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_timing_runs", + ) + autotune_group.add_argument( + "--autotune_trtexec_args", + type=str, + default=None, + help=( + "Additional trtexec arguments as a single quoted string. " + "Example: --autotune_trtexec_args '--fp16 --workspace=4096'" + ), + ) return argparser +def apply_mode_presets(args) -> None: + """Apply --autotune=mode preset to schemes_per_region, warmup_runs, timing_runs. + + Only applies preset for an option when that option was not explicitly set on the + command line (explicit flags override the preset). 
+ """ + if args.autotune not in MODE_PRESETS: + return + preset = MODE_PRESETS[args.autotune] + if not getattr(args, "_explicit_autotune_schemes_per_region", False): + args.autotune_schemes_per_region = preset["schemes_per_region"] + if not getattr(args, "_explicit_autotune_warmup_runs", False): + args.autotune_warmup_runs = preset["warmup_runs"] + if not getattr(args, "_explicit_autotune_timing_runs", False): + args.autotune_timing_runs = preset["timing_runs"] + + def main(): """Command-line entrypoint for ONNX PTQ.""" args = get_parser().parse_args() @@ -331,6 +455,14 @@ def main(): else: raise + # Autotune configs + autotune_enabled = args.autotune is not None + if autotune_enabled: + apply_mode_presets(args) + autotune_node_filter_list = ( + get_node_filter_list(args.autotune_node_filter_list) if autotune_enabled else None + ) + quantize( args.onnx_path, quantize_mode=args.quantize_mode, @@ -362,6 +494,19 @@ def main(): calibrate_per_node=args.calibrate_per_node, direct_io_types=args.direct_io_types, opset=args.opset, + autotune=autotune_enabled, + autotune_output_dir=args.autotune_output_dir, + autotune_num_schemes_per_region=args.autotune_schemes_per_region, + autotune_pattern_cache_file=args.autotune_pattern_cache_file, + autotune_state_file=args.autotune_state_file, + autotune_qdq_baseline=args.autotune_qdq_baseline, + autotune_node_filter_list=autotune_node_filter_list, + autotune_verbose=args.autotune_verbose, + autotune_use_trtexec=args.autotune_use_trtexec, + autotune_timing_cache=args.autotune_timing_cache, + autotune_warmup_runs=args.autotune_warmup_runs, + autotune_timing_runs=args.autotune_timing_runs, + autotune_trtexec_args=args.autotune_trtexec_args, ) diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index 7f14bb360..74f44f972 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -20,6 +20,9 @@ region analysis to 
efficiently explore and optimize Q/DQ insertion strategies. """ +# Expose Autotune modes +from .__main__ import MODE_PRESETS + # Core data structures from .autotuner import QDQAutotuner from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark @@ -42,8 +45,10 @@ ) from .region_pattern import RegionPattern from .region_search import CombinedRegionSearch +from .utils import StoreWithExplicitFlag, get_node_filter_list __all__ = [ + "MODE_PRESETS", "AutotunerError", "AutotunerNotInitializedError", "ChildRegionInputInsertionPoint", @@ -60,6 +65,8 @@ "RegionPattern", "RegionType", "ResolvedInsertionPoint", + "StoreWithExplicitFlag", "TensorRTPyBenchmark", "TrtExecBenchmark", + "get_node_filter_list", ] diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index cb7b3c281..071ba6ceb 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -21,6 +21,11 @@ from pathlib import Path from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.utils import ( + StoreWithExplicitFlag, + get_node_filter_list, + validate_file_path, +) from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, region_pattern_autotuning_workflow, @@ -44,18 +49,6 @@ } -class _StoreWithExplicitFlag(argparse.Action): - """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" - - def __init__(self, explicit_attr: str, *args, **kwargs): - self._explicit_attr = explicit_attr - super().__init__(*args, **kwargs) - - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, values) - setattr(namespace, self._explicit_attr, True) - - def apply_mode_presets(args) -> None: """Apply --mode preset to schemes_per_region, warmup_runs, timing_runs. 
@@ -73,30 +66,6 @@ def apply_mode_presets(args) -> None: args.timing_runs = preset["timing_runs"] -def validate_file_path(path: str | None, description: str) -> Path | None: - """Validate that a file path exists. - - Args: - path: Path string to validate (can be None) - description: Description of the file for error messages - - Returns: - Path object if valid, None if path is None - - Raises: - SystemExit: If path is provided but doesn't exist - """ - if path is None: - return None - - path_obj = Path(path) - if not path_obj.exists(): - logger.error(f"{description} not found: {path_obj}") - sys.exit(1) - - return path_obj - - def log_benchmark_config(args): """Log TensorRT benchmark configuration for transparency. @@ -155,20 +124,9 @@ def run_autotune() -> int: return 1 try: - node_filter_list = None - if args.node_filter_list: - filter_file = validate_file_path(args.node_filter_list, "Node filter list file") - if filter_file: - with open(filter_file) as f: - node_filter_list = [ - line.strip() - for line in f - if line.strip() and not line.strip().startswith("#") - ] - logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") - + node_filter_list = get_node_filter_list(args.node_filter_list) region_pattern_autotuning_workflow( - model_path=str(model_path), + model_or_path=str(model_path), output_dir=output_dir, num_schemes_per_region=args.num_schemes, pattern_cache_file=args.pattern_cache_file, @@ -262,7 +220,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: type=int, default=DEFAULT_NUM_SCHEMES, dest="num_schemes", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_num_schemes", help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)", ) @@ -328,7 +286,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--warmup_runs", type=int, default=DEFAULT_WARMUP_RUNS, - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, 
explicit_attr="_explicit_warmup_runs", help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)", ) @@ -336,7 +294,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--timing_runs", type=int, default=DEFAULT_TIMING_RUNS, - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_timing_runs", help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)", ) diff --git a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index a519d7c61..6df297e95 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -35,7 +35,7 @@ import yaml from modelopt.onnx.logging_config import logger -from modelopt.onnx.op_types import is_linear_op +from modelopt.onnx.op_types import get_activation_ops, is_linear_op from modelopt.onnx.quantization.autotune.common import ( AutotunerNotInitializedError, Config, @@ -46,7 +46,10 @@ Region, ) from modelopt.onnx.quantization.autotune.export_utils import export_qdq_onnx -from modelopt.onnx.quantization.autotune.insertion_points import ResolvedInsertionPoint +from modelopt.onnx.quantization.autotune.insertion_points import ( + ResolvedInsertionPoint, + get_autotuner_quantizable_ops, +) from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern from modelopt.onnx.quantization.graph_utils import get_tensor_consumer_node_indices @@ -434,6 +437,125 @@ def _exclude_overlapping_insertion_points( if all_region_ips: logger.debug(f" → Excluded {len(all_region_ips)} overlapping insertion points") + @_requires_init + def get_resolved_insertion_points( + self, best: bool = True, verbose: bool = False + ) -> set[ResolvedInsertionPoint]: + """Compute Q/DQ insertion points for the best schemes (assuming best=True). + + Args: + best: If True, use the best scheme for each region. 
If False, use the current scheme. + verbose: If True, log matched-region counts and per-region insertion point details. + + Returns: + Set of ResolvedInsertionPoint objects representing where Q/DQ pairs should be inserted. + + Raises: + AutotunerNotInitializedError: If initialize() hasn't been called + """ + resolved_insertion_points: set[ResolvedInsertionPoint] = set() + matched_regions = 0 + + if verbose: + logger.debug(f"Resolving Q/DQ insertion points from {len(self.regions)} regions") + + for region in self.regions: + current_scheme, pattern = self._resolve_scheme_for_region(region, best) + if current_scheme is None: + continue + self._exclude_overlapping_insertion_points(resolved_insertion_points, region, pattern) + new_insertion_points = pattern.matches(region, self.graph, current_scheme) + if new_insertion_points: + resolved_insertion_points.update(new_insertion_points) + matched_regions += 1 + if verbose: + logger.debug(f" → Added {len(new_insertion_points)} insertion points") + if verbose: + logger.debug( + f"Matched {matched_regions}/{len(self.regions)} regions, " + f"total {len(resolved_insertion_points)} unique insertion points" + ) + return resolved_insertion_points + + @_requires_init + def get_ort_quantization_config( + self, + ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: + """Derive ORT quantization configuration from resolved insertion points. + + Returns the four parameters consumed by INT8 and FP8 quantize() to replicate the autotuner's + Q/DQ placement decisions without exporting any intermediate ONNX file to disk. + + Returns: + nodes_to_quantize: Node names that have at least one covered Q/DQ input. + op_types_to_quantize: Op types eligible for quantization. + no_quantize_inputs: List of (src_node, dst_node, tensor_name) tuples for inputs + of quantized nodes that should NOT receive Q/DQ. 
+ op_types_needing_output_quant: Producer op types whose output feeds a covered + activation-op input (needed so ORT inserts Q/DQ between e.g. Add and Relu). + + Raises: + AutotunerNotInitializedError: If initialize() hasn't been called. + """ + resolved_ips = self.get_resolved_insertion_points(best=True) + graph = self.graph + + # Build (node_index, input_index) pairs that have Q/DQ + covered: set[tuple[int, int]] = set() + for ip in resolved_ips: + if ip.node_index is not None and ip.input_index is not None: + covered.add((ip.node_index, ip.input_index)) + else: + # Tensor-level insertion point: expand to all consumer (node, input) pairs + for consumer_idx in graph.tensor_users_map.get(ip.tensor_name, []): + node = graph.nodes[consumer_idx] + for inp_idx, inp in enumerate(node.inputs): + if getattr(inp, "name", None) == ip.tensor_name: + covered.add((consumer_idx, inp_idx)) + + # Nodes that consume a covered (DQ-fed) input + quantized_node_indices: set[int] = {node_idx for node_idx, _ in covered} + + # Also include producer nodes of covered inputs: a producer whose output feeds a + # covered slot needs to be in nodes_to_quantize so ORT can place Q on its output + # (e.g., Add must be included when Q/DQ sits between Add and Relu). 
+ node_name_to_idx = {node.name: i for i, node in enumerate(graph.nodes)} + for node_idx, inp_idx in covered: + tensor = graph.nodes[node_idx].inputs[inp_idx] + if tensor.inputs: + producer_idx = node_name_to_idx.get(tensor.inputs[0].name) + if producer_idx is not None: + quantized_node_indices.add(producer_idx) + + nodes_to_quantize = [graph.nodes[i].name for i in quantized_node_indices] + op_types_to_quantize = list(get_autotuner_quantizable_ops()) + + # Inputs of quantized nodes NOT covered by Q/DQ (only non-constant producer inputs) + no_quantize_inputs: list[tuple[gs.Node, gs.Node, str]] = [] + for node_idx in quantized_node_indices: + node = graph.nodes[node_idx] + for inp_idx, inp in enumerate(node.inputs): + if (node_idx, inp_idx) not in covered and getattr(inp, "name", None): + if inp.inputs: + no_quantize_inputs.append((inp.inputs[0], node, inp.name)) + + # Producer op types whose output feeds a covered activation-op input + # (e.g., to support Add->Q/DQ->Relu patterns) + op_types_needing_output_quant: set[str] = set() + for node_idx, inp_idx in covered: + node = graph.nodes[node_idx] + if node.op in get_activation_ops(): + tensor = node.inputs[inp_idx] + if tensor.inputs: + op_types_needing_output_quant.add(tensor.inputs[0].op) + + return ( + nodes_to_quantize, + op_types_to_quantize, + no_quantize_inputs, + list(op_types_needing_output_quant), + ) + @_requires_init def export_onnx( self, output_path: str | None = None, insert_qdq: bool = True, best: bool = False @@ -469,29 +591,7 @@ def export_onnx( ) if insert_qdq: - matched_regions = 0 - - logger.debug(f"Resolving Q/DQ insertion points from {len(self.regions)} regions") - - for region in self.regions: - current_scheme, pattern = self._resolve_scheme_for_region(region, best) - if current_scheme is None: - continue - - self._exclude_overlapping_insertion_points( - resolved_insertion_points, region, pattern - ) - - new_ips = pattern.matches(region, self.graph, current_scheme) - if new_ips: - 
resolved_insertion_points.update(new_ips) - matched_regions += 1 - logger.debug(f" → Added {len(new_ips)} insertion points") - - logger.debug( - f"Matched {matched_regions}/{len(self.regions)} regions, " - f"total {len(resolved_insertion_points)} unique insertion points" - ) + resolved_insertion_points = self.get_resolved_insertion_points(best=best, verbose=True) unique_tensors = len(resolved_insertion_points) diff --git a/modelopt/onnx/quantization/autotune/utils.py b/modelopt/onnx/quantization/autotune/utils.py new file mode 100644 index 000000000..8760b4bc1 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/utils.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utility functions related to Autotune.""" + +import argparse +import sys +from pathlib import Path + +from modelopt.onnx.logging_config import logger + + +class StoreWithExplicitFlag(argparse.Action): + """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" + + def __init__(self, explicit_attr: str, *args, **kwargs): + """Initialize explicit attribute flag.""" + self._explicit_attr = explicit_attr + super().__init__(*args, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + """Set attributes.""" + setattr(namespace, self.dest, values) + setattr(namespace, self._explicit_attr, True) + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. + + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def get_node_filter_list(node_filter_list_path: str) -> list | None: + """Extract node filter list from node filters path. + + Args: + node_filter_list_path: Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). 
+ + Returns: + Node filter list + """ + node_filter_list = None + if node_filter_list_path: + filter_file = validate_file_path(node_filter_list_path, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() for line in f if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + return node_filter_list diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index 025d9fac4..190882a31 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -20,6 +20,8 @@ """ import fnmatch +import shutil +import tempfile from pathlib import Path import onnx @@ -158,8 +160,8 @@ def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool: def region_pattern_autotuning_workflow( - model_path: str, - output_dir: Path, + model_or_path: str | onnx.ModelProto, + output_dir: Path | None = None, num_schemes_per_region: int = 30, pattern_cache_file: str | None = None, state_file: str | None = None, @@ -195,8 +197,8 @@ def region_pattern_autotuning_workflow( 7. Export final optimized model with best Q/DQ scheme for each pattern Args: - model_path: Path to ONNX model file to optimize - output_dir: Directory for output files (state, logs, models). Created if doesn't exist. + model_or_path: Path to ONNX model file to optimize + output_dir: Directory for output files (state, logs, models). Created if it doesn't exist. num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern. 
Higher values explore more configurations but take longer (default: 30) pattern_cache_file: Optional path to pattern cache YAML file containing known-good schemes @@ -205,6 +207,7 @@ def region_pattern_autotuning_workflow( uses /autotuner_state.yaml (default: None) quant_type: Quantization data type - "int8" for INT8 quantization (default), "fp8" for FP8 quantization + default_dq_dtype: Dtype for DequantizeLinear output; "float32" (default) or "float16". qdq_baseline_model: Optional path to a pre-quantized ONNX model. If provided, extracts Q/DQ insertion patterns and adds them to pattern cache for warm-start (default: None) @@ -215,6 +218,10 @@ def region_pattern_autotuning_workflow( Returns: QDQAutotuner instance after autotuning """ + output_dir_is_temp = output_dir is None + if not output_dir: + output_dir = Path(tempfile.mkdtemp()) + output_dir.mkdir(parents=True, exist_ok=True) logs_dir = output_dir / "logs" logs_dir.mkdir(exist_ok=True) @@ -225,8 +232,11 @@ def region_pattern_autotuning_workflow( state_file = str(output_dir / "autotuner_state.yaml") state_path = Path(state_file) - logger.info(f"Loading model: {model_path}") - model = onnx.load(model_path) + if isinstance(model_or_path, str): + logger.info(f"Loading model: {model_or_path}") + model = onnx.load(model_or_path) + else: + model = model_or_path pattern_cache = None if pattern_cache_file: @@ -373,4 +383,9 @@ def region_pattern_autotuning_workflow( logger.debug(f" Logs: {logs_dir}") logger.debug(f" Region models: {models_dir}") + # Remove temporary folder + if output_dir_is_temp and output_dir.exists(): + shutil.rmtree(output_dir) + logger.info(f"Temporary directory {output_dir} was deleted!") + return autotuner diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py index 76a3e8167..b7146173a 100755 --- a/modelopt/onnx/quantization/fp8.py +++ b/modelopt/onnx/quantization/fp8.py @@ -183,6 +183,7 @@ def quantize( custom_ops_to_quantize: list[str] = [], direct_io_types: bool 
= False, opset: int | None = None, + autotune: bool = False, **kwargs, ) -> onnx.ModelProto: """Applies FP8 GEMM only quantization to an ONNX file. @@ -215,10 +216,12 @@ def quantize( op_types_to_quantize.extend(list(custom_ops_to_quantize)) enable_gemv_detection_for_trt = kwargs.get("enable_gemv_detection_for_trt", True) - if enable_gemv_detection_for_trt: + if enable_gemv_detection_for_trt and not autotune: # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. + # Note that this check will be skipped if Autotune is enabled as Q/DQ node placements + # will be decided according to TensorRT's runtime measurements. logger.info("Detecting GEMV patterns for TRT optimization") matmul_nodes_to_exclude = find_nodes_from_matmul_to_exclude( onnx_path, @@ -233,7 +236,8 @@ def quantize( # Collect node names to exclude from quantization nodes_to_exclude = find_nodes_to_exclude(graph, nodes_to_exclude, op_types_to_exclude) # type: ignore[arg-type] - nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")) + if not autotune: + nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")) # Change the default configuration of ORT quantization op_types = {node.op for node in graph.nodes} @@ -244,19 +248,22 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, + kwargs.get("op_types_needing_output_quant"), ) logger.info( f"Quantizable op types in the model: {[t for t in op_types_to_quantize if t in op_types]}" ) # Collect node names to include in quantization - no_quantize_inputs = [] - nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) - if not nodes_to_quantize: - quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( - graph, quantizable_op_types, nodes_to_exclude - ) - nodes_to_quantize = [node.name for node in quantizable_nodes] + 
nodes_to_quantize = nodes_to_quantize or [] + no_quantize_inputs = kwargs.get("no_quantize_inputs", []) + if not autotune: + nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) + if not nodes_to_quantize: + quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( + graph, quantizable_op_types, nodes_to_exclude + ) + nodes_to_quantize = [node.name for node in quantizable_nodes] # Update the list of nodes to quantize nodes_to_quantize = [ diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py index efa77dd7b..131723e61 100755 --- a/modelopt/onnx/quantization/graph_utils.py +++ b/modelopt/onnx/quantization/graph_utils.py @@ -616,16 +616,37 @@ def remove_partial_input_qdq( # Reached end of the graph continue if dq_node.op == "DequantizeLinear": - dq_node = dq_node.outputs[0] # source_node->Q->DQ->target_node0 + dq_output = dq_node.outputs[0] # source_node->Q->DQ->target_node + + # Look up the specific target node in the quantized graph. + # With DedicatedQDQPair=False, a shared Q/DQ pair may feed multiple consumers + # (e.g. Conv activation AND Add residual). Always patch the intended target + # rather than the first consumer of the DQ output to avoid removing Q/DQ from + # the wrong branch. + target_node_in_graph = graph_nodes.get(target.name) + if target_node_in_graph is None: + continue - # Find the input index in the target connecting with source_node + # Find the input index in the target that is connected to the DQ output target_input_idx_arr = [ - idx for idx, inp in enumerate(dq_node.outputs[0].inputs) if inp.name == dq_node.name + idx + for idx, inp in enumerate(target_node_in_graph.inputs) + if inp.name == dq_output.name ] - target_input_idx = target_input_idx_arr[0] if target_input_idx_arr else 0 + # If no input index is found (dq_output is not actually connected to target node), skip rewiring to + # prevent silent corruption of the graph. 
+ if not target_input_idx_arr: + logger.warning( + "Expected DequantizeLinear output '%s' to be an input of node '%s', " + "but no matching input was found. Skipping Q/DQ bypass for this edge.", + dq_output.name, + target_node_in_graph.name, + ) + continue + target_input_idx = target_input_idx_arr[0] - # Connect the output of source_node with the output of DQ - dq_node.outputs[0].inputs[target_input_idx] = source_node.outputs[0] + # Connect the target's input directly to source_node's output (bypass Q/DQ) + target_node_in_graph.inputs[target_input_idx] = source_node.outputs[0] # Check for quantized residual Adds where the parallel branch is not being quantized for source, target, non_qdq_input_name in no_quantize_inputs: diff --git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py index 6e350a16f..ad2ca9558 100755 --- a/modelopt/onnx/quantization/int8.py +++ b/modelopt/onnx/quantization/int8.py @@ -133,6 +133,7 @@ def quantize( custom_ops_to_quantize: list[str] = [], direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, **kwargs, ) -> onnx.ModelProto: """Applies INT8 quantization to an ONNX file using the compiler friendly heuristics. @@ -157,10 +158,12 @@ def quantize( return onnx_model enable_gemv_detection_for_trt = kwargs.get("enable_gemv_detection_for_trt", True) - if enable_gemv_detection_for_trt: + if enable_gemv_detection_for_trt and not autotune: # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. + # Note that this check will be skipped if Autotune is enabled as Q/DQ node placements + # will be decided according to TensorRT's runtime measurements. 
logger.info("Detecting GEMV patterns for TRT optimization") matmul_nodes_to_exclude = find_nodes_from_matmul_to_exclude( onnx_path, @@ -175,7 +178,8 @@ def quantize( # Collect node names to exclude from quantization nodes_to_exclude = find_nodes_to_exclude(graph, nodes_to_exclude, op_types_to_exclude) # type: ignore[arg-type] - nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")) + if not autotune: + nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")) # Change the default configuration of ORT quantization op_types_to_quantize = op_types_to_quantize or [] @@ -189,22 +193,27 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, + kwargs.get("op_types_needing_output_quant"), ) logger.info(f"Quantizable op types: {[t for t in quantizable_op_types if t in op_types]}") # Collect node names to include in quantization - no_quantize_inputs = [] - nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) - if not nodes_to_quantize: - # If nodes_to_quantize is not passed, use user supplied op_types_to_quantize list - nodes_to_quantize = [node.name for node in graph.nodes if node.op in op_types_to_quantize] - - # If op_types_to_quantize is not provided, use default QDQ placement algorithm + nodes_to_quantize = nodes_to_quantize or [] + no_quantize_inputs = kwargs.get("no_quantize_inputs", []) + if not autotune: + nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) if not nodes_to_quantize: - quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( - graph, quantizable_op_types, nodes_to_exclude - ) - nodes_to_quantize = [node.name for node in quantizable_nodes] + # If nodes_to_quantize is not passed, use user supplied op_types_to_quantize list + nodes_to_quantize = [ + node.name for node in graph.nodes if node.op in op_types_to_quantize + ] + + # If op_types_to_quantize is not provided, use default QDQ placement algorithm + if not 
nodes_to_quantize: + quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( + graph, quantizable_op_types, nodes_to_exclude + ) + nodes_to_quantize = [node.name for node in quantizable_nodes] # Read the calibration cache and quantize nodes for which activation scale values are cached if calibration_cache_path: @@ -220,7 +229,8 @@ def quantize( logger.info( f"Skipping quantization of nodes: {set(nodes_to_quantize) - set(iq_quantized_nodes)}" ) - nodes_to_quantize = list(set(nodes_to_quantize).intersection(iq_quantized_nodes)) + if not autotune: + nodes_to_quantize = list(set(nodes_to_quantize).intersection(iq_quantized_nodes)) # Update the list of nodes to quantize nodes_to_quantize = [ diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 5c89e20d7..173fbb06d 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -271,6 +271,7 @@ def configure_ort( calibration_eps: list[str] | None = None, calibrate_per_node: bool = False, custom_ops_to_quantize: list[str] = [], + op_types_needing_output_quant: list[str] | None = None, ): """Configure and patches ORT to support ModelOpt ONNX quantization.""" logger.info("Configuring ORT for ModelOpt ONNX quantization") @@ -291,7 +292,7 @@ def configure_ort( # Remove copy, reduction and activation ops from ORT QDQ registry logger.debug("Removing non-quantizable ops from QDQ registry") - for op_type in [ + for op_type in { "ArgMax", "Concat", "EmbedLayerNormalization", @@ -311,7 +312,7 @@ def configure_ort( "Transpose", "Unsqueeze", "Where", - ]: + } - set(op_types_to_quantize): if op_type in QLinearOpsRegistry: del QLinearOpsRegistry[op_type] if op_type in QDQRegistry: @@ -319,7 +320,10 @@ def configure_ort( # Prepare TensorRT friendly quantization settings no_output_quantization_op_types = [ - op_type for op_type in op_types if op_type not in custom_ops_to_quantize + op_type + for op_type in op_types + if op_type not in 
custom_ops_to_quantize + and op_type not in (op_types_needing_output_quant or []) ] if trt_extra_plugin_lib_paths is not None: trt_extra_plugin_lib_paths = ";".join(trt_extra_plugin_lib_paths) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index da7ff126d..b53904657 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -36,6 +36,7 @@ import shutil import tempfile from collections.abc import Sequence +from pathlib import Path from typing import Any import onnx @@ -45,6 +46,14 @@ from modelopt.onnx.logging_config import configure_logging, logger from modelopt.onnx.op_types import is_data_dependent_shape_op + +try: + from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, + ) +except ImportError: + logger.warning("Failed to import Autotune dependencies") from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -242,6 +251,54 @@ def _preprocess_onnx( ) +def _find_nodes_to_quantize_autotune( + onnx_model: onnx.ModelProto, + quantize_mode: str, + trt_plugins: list[str] | None, + high_precision_dtype: str = "fp16", + output_dir: str | None = None, + num_schemes_per_region: int = 50, + pattern_cache_file: str | None = None, + state_file: str | None = None, + qdq_baseline_model: str | None = None, + node_filter_list: list[str] | None = None, + verbose: bool = False, + use_trtexec: bool = False, + timing_cache_file: str | None = None, + warmup_runs: int = 50, + timing_runs: int = 100, + trtexec_args: str | None = None, +) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: + """Extracts quantization information from Autotune to provide ORT quantization.""" + logger.info("Running Auto Q/DQ with TensorRT") + + benchmark_instance = init_benchmark_instance( + use_trtexec=use_trtexec, + plugin_libraries=trt_plugins, + 
timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + trtexec_args=trtexec_args.split() if trtexec_args else None, + ) + if benchmark_instance is None: + raise RuntimeError("Failed to initialize TensorRT benchmark") + + precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} + autotuner = region_pattern_autotuning_workflow( + onnx_model, + output_dir=Path(output_dir) if output_dir else None, + num_schemes_per_region=num_schemes_per_region, + pattern_cache_file=pattern_cache_file, + state_file=state_file, + quant_type=quantize_mode, + default_dq_dtype=precision_map[high_precision_dtype], + qdq_baseline_model=qdq_baseline_model, + node_filter_list=node_filter_list, + verbose=verbose, + ) + return autotuner.get_ort_quantization_config() + + def quantize( onnx_path: str, quantize_mode: str = "int8", @@ -275,6 +332,19 @@ def quantize( input_shapes_profile: Sequence[dict[str, str]] | None = None, direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, + autotune_output_dir: str | None = None, + autotune_num_schemes_per_region: int = 50, + autotune_pattern_cache_file: str | None = None, + autotune_state_file: str | None = None, + autotune_qdq_baseline: str | None = None, + autotune_node_filter_list: list[str] | None = None, + autotune_verbose: bool = False, + autotune_use_trtexec: bool = False, + autotune_timing_cache: str | None = None, + autotune_warmup_runs: int = 50, + autotune_timing_runs: int = 100, + autotune_trtexec_args: str | None = None, **kwargs: Any, ) -> None: """Quantizes the provided ONNX model. @@ -398,6 +468,35 @@ def quantize( Target ONNX opset version for the quantized model. If None, uses required minimum opset (19 for int8/fp8, 21 for int4, 23 for nvfp4). If the specified opset is lower than the required minimum, a warning will be issued and the opset will be upgraded to the required minimum. 
+ autotune: + If True, detect optimal Q/DQ node placements according to the TensorRT version and platform available. + If False, use the default pattern-based quantization approach. + autotune_output_dir: + Output directory for autotune results (state file, logs). Default: temp directory. + autotune_num_schemes_per_region: + Number of Q/DQ schemes to test per region. + autotune_pattern_cache_file: + Path to pattern cache YAML for warm-start. + autotune_qdq_baseline: + Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start. + autotune_state_file: + State file path for crash recovery and resume capability (default: <output_dir>/autotuner_state.yaml). + autotune_node_filter_list: + Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). Regions without + any matching nodes are skipped during autotuning. + autotune_verbose: + Enable verbose logging in the autotuner. + autotune_use_trtexec: + Use trtexec for benchmarking instead of the TensorRT Python API. + autotune_timing_cache: + TensorRT timing cache file for faster engine builds. + autotune_warmup_runs: + Number of warmup runs before timing. + autotune_timing_runs: + Number of timed runs for latency measurement. + autotune_trtexec_args: + Additional trtexec arguments as a single quoted string. + Example: --autotune_trtexec_args '--fp16 --workspace=4096' kwargs: Additional keyword arguments for int4 quantization, including: - awqlite_alpha_step (float): Alpha step for lite, range [0, 1]. 
@@ -506,6 +605,35 @@ def quantize( calibration_shapes = get_input_shapes(onnx_path) if quantize_mode in ["fp8", "int8"]: + if autotune: + ( + nodes_to_quantize_autotune, + op_types_to_quantize_autotune, + no_quantize_inputs, + op_types_needing_output_quant, + ) = _find_nodes_to_quantize_autotune( + onnx_model, + quantize_mode, + trt_plugins, + high_precision_dtype, + output_dir=autotune_output_dir, + num_schemes_per_region=autotune_num_schemes_per_region, + pattern_cache_file=autotune_pattern_cache_file, + state_file=autotune_state_file, + qdq_baseline_model=autotune_qdq_baseline, + node_filter_list=autotune_node_filter_list, + verbose=autotune_verbose, + use_trtexec=autotune_use_trtexec, + timing_cache_file=autotune_timing_cache, + warmup_runs=autotune_warmup_runs, + timing_runs=autotune_timing_runs, + trtexec_args=autotune_trtexec_args, + ) + op_types_to_quantize = op_types_to_quantize or op_types_to_quantize_autotune + nodes_to_quantize = nodes_to_quantize or nodes_to_quantize_autotune + kwargs["no_quantize_inputs"] = no_quantize_inputs + kwargs["op_types_needing_output_quant"] = op_types_needing_output_quant + quantize_func = quantize_int8 if quantize_mode == "int8" else quantize_fp8 onnx_model = quantize_func( onnx_path=onnx_path, @@ -531,8 +659,10 @@ def quantize( custom_ops_to_quantize=list(custom_ops_to_quantize.keys()), direct_io_types=direct_io_types, opset=opset, + autotune=autotune, **kwargs, ) + elif "int4" in quantize_mode: onnx_model = quantize_int4( onnx_path=onnx_path, diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index fc63f6690..84a8b4ab8 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -20,6 +20,8 @@ """ import onnx +import torch +import torch.nn as nn from onnx import helper @@ -52,3 +54,42 @@ def _create_simple_conv_onnx_model(): ], ) return helper.make_model(graph, 
producer_name="test") + + +def _create_simple_resnet18_model(): + """Build a ResNet-18 subgraph (stem + layer1) for MOQ + Autotuner integration tests. + + Architecture: + Conv(3→64, 7×7, stride=2) → ReLU → MaxPool(3×3, stride=2) + → BasicBlock(64→64) → BasicBlock(64→64) + + Input shape: [1, 3, 1024, 1024], output shape: [1, 64, 256, 256]. + """ + + class _BasicBlock(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(64, 64, 3, padding=1, bias=True) + self.act1 = nn.ReLU() + self.conv2 = nn.Conv2d(64, 64, 3, padding=1, bias=True) + self.act2 = nn.ReLU() + + def forward(self, x): + return self.act2(self.conv2(self.act1(self.conv1(x))) + x) + + class _Model(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=True) + self.act1 = nn.ReLU() + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.layer1 = nn.Sequential(_BasicBlock(), _BasicBlock()) + + def forward(self, x): + return self.layer1(self.maxpool(self.act1(self.conv1(x)))) + + torch.manual_seed(42) + model = _Model().eval() + input_tensor = torch.zeros(1, 3, 1024, 1024) + + return model, input_tensor diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py new file mode 100644 index 000000000..829eebb55 --- /dev/null +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from unittest.mock import patch + +import onnx +import onnx_graphsurgeon as gs +from _test_utils.import_helper import skip_if_no_tensorrt +from _test_utils.onnx.lib_test_models import export_as_onnx +from _test_utils.onnx.quantization.autotune.models import _create_simple_resnet18_model + +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) +from modelopt.onnx.quantization.quantize import _preprocess_onnx, quantize + +skip_if_no_tensorrt() + + +def _quantized_tensor_indices(model: onnx.ModelProto) -> set[tuple[str, int]]: + """Return (node_name, input_index) for every DQ-fed input slot in the model.""" + graph = gs.import_onnx(model) + return { + (node.name, inp_idx) + for node in graph.nodes + for inp_idx, inp in enumerate(node.inputs) + if inp.inputs and inp.inputs[0].op == "DequantizeLinear" + } + + +def _collect_q_scales(model: onnx.ModelProto) -> dict[str, float]: + """Return {scale_initializer_name: float_value} for every QuantizeLinear node. + + Works for both float32 and float16 scale initializers (the latter produced by + the fp16-conversion pass that runs after ORT calibration). 
+ """ + initializers = {init.name: init for init in model.graph.initializer} + scales = {} + for node in model.graph.node: + if node.op_type == "QuantizeLinear" and len(node.input) >= 2: + scale_name = node.input[1] + if scale_name in initializers: + raw = onnx.numpy_helper.to_array(initializers[scale_name]) + scales[scale_name] = float(raw.flat[0]) + return scales + + +def test_autotune_quantization_integration(tmp_path): + """Ensure that the quantized tensors are the same for standalone Autotune and MOQ with Autotune. + + Also ensure that the scales in the Q/DQ nodes have been updated from standalone Autotune to MOQ with Autotune. + + Runs the autotuner once to obtain a fixed set of insertion points. The same autotuner instance is then injected + into quantize() via patching so that both sides reflect identical placement decisions without a second TRT + profiling run. + + Compares the set of (node_name, input_index) pairs where a DQ node feeds the input between: + - the autotuner's own export (via export_onnx), and + - the quantize(autotune=True) output model. + """ + model_torch, input_tensor = _create_simple_resnet18_model() + onnx_path = os.path.join(tmp_path, "model.onnx") + output_path = onnx_path.replace(".onnx", ".quant.onnx") + + # Export torch model to ONNX + export_as_onnx(model_torch, input_tensor, onnx_filename=onnx_path) + + # Load and pre-process ONNX + onnx_path, onnx_model, *_ = _preprocess_onnx( + onnx_path, + use_external_data_format=False, + output_path=output_path, + enable_shared_constants_duplication=True, + trt_plugins=None, + trt_plugins_precision=None, + override_shapes=None, # type: ignore[arg-type] + quantize_mode="int8", + ) + + # Run autotune once to get a determined set of placement decisions. 
+ init_benchmark_instance(use_trtexec=False) + autotuner = region_pattern_autotuning_workflow( + onnx_model, + quant_type="int8", + default_dq_dtype="float16", + ) + + # Autotune path: export the Q/DQ model directly and collect quantized tensor slots. + autotune_model = onnx.load_from_string(autotuner.export_onnx(best=True)) + autotune_tensors = _quantized_tensor_indices(autotune_model) + + # MOQ + Autotune path: inject the same autotuner so placement decisions are identical, + # then run the full quantize() pipeline and collect quantized tensor slots. + with patch( + "modelopt.onnx.quantization.quantize.region_pattern_autotuning_workflow", + return_value=autotuner, + ): + quantize(onnx_path, autotune=True, output_path=output_path) + + # Check Q/DQ nodes placement + moq_tensors = _quantized_tensor_indices(onnx.load(output_path)) + assert autotune_tensors == moq_tensors + + # Check Q/DQ scales + scales_random = _collect_q_scales(autotune_model) + scales_calib = _collect_q_scales(onnx.load(output_path)) + assert scales_random, "Expected at least one Q scale in the standalone Autotune model" + assert scales_calib, "Expected at least one Q scale in the MOQ + Autotune integrated model" + assert len(scales_random.keys()) == len(scales_calib.keys()), ( + "Both models must quantize the same number of tensors" + ) + assert all( + v != list(scales_calib.values())[idx] for idx, v in enumerate(scales_random.values()) + ), ( + "All or some Q/DQ scales are identical between the standalone Autotune and MOQ + Autotune integrated models. " + "The integrated quantization appears to have had no effect on scale computation." 
+ ) diff --git a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py index 294501ff0..a2d61c507 100644 --- a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py +++ b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + """ Tests for PatternCache in the autotuner. diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index 5a733017d..34e2cd244 100644 --- a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -13,12 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """Tests for the Region class in the autotuner.""" import pytest