From 3554ecfbb121dff9b79f2cc768913d9d3b94d12f Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Fri, 23 Jan 2026 20:04:15 -0500 Subject: [PATCH 01/42] Initial autotune codebase Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/cli.py | 294 +++++++++++++++++++++ 1 file changed, 294 insertions(+) create mode 100644 modelopt/onnx/quantization/autotune/cli.py diff --git a/modelopt/onnx/quantization/autotune/cli.py b/modelopt/onnx/quantization/autotune/cli.py new file mode 100644 index 000000000..a5809f9a5 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/cli.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""CLI argument parsing and execution for ONNX Q/DQ autotuning. + +This module provides `run_autotune` which handles both argument parsing and +workflow execution. See `__main__.py` for usage examples. 
+""" + +import argparse +import sys +from pathlib import Path + +from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) + +DEFAULT_OUTPUT_DIR = "./autotuner_output" +DEFAULT_NUM_SCHEMES = 30 +DEFAULT_QUANT_TYPE = "int8" +DEFAULT_DQ_DTYPE = "float32" +DEFAULT_TIMING_CACHE = "/tmp/trtexec_timing.cache" # nosec B108 +DEFAULT_WARMUP_RUNS = 5 +DEFAULT_TIMING_RUNS = 20 + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. + + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def log_benchmark_config(args): + """Log TensorRT benchmark configuration for transparency. + + Logs timing cache path, warmup/timing run counts, and any custom + plugin libraries that will be loaded. + + Args: + args: Parsed command-line arguments with benchmark configuration + """ + logger.info("Initializing TensorRT benchmark") + logger.info(f" Timing cache: {args.timing_cache}") + logger.info(f" Warmup runs: {args.warmup_runs}") + logger.info(f" Timing runs: {args.timing_runs}") + if args.plugin_libraries: + logger.info(f" Plugin libraries: {', '.join(args.plugin_libraries)}") + + +def run_autotune(args=None) -> int: + """Execute the complete pattern-based Q/DQ autotuning workflow. + + This function orchestrates the entire optimization process: + 1. Parses command-line arguments (if not provided) + 2. Validates input paths (model, baseline, output directory) + 3. Initializes TensorRT benchmark instance + 4. 
Runs pattern-based region autotuning workflow + 5. Handles interruptions gracefully with state preservation + + Args: + args: Optional parsed command-line arguments. If None, parses sys.argv. + + Returns: + Exit code: + - 0: Success + - 1: Autotuning failed (exception occurred) + - 130: Interrupted by user (Ctrl+C) + """ + if args is None: + args = _get_autotune_parser().parse_args() + + model_path = validate_file_path(args.onnx_path, "Model file") + validate_file_path(args.qdq_baseline, "QDQ baseline model") + output_dir = Path(args.output) + + log_benchmark_config(args) + init_benchmark_instance( + use_trtexec=args.use_trtexec, + plugin_libraries=args.plugin_libraries, + timing_cache_file=args.timing_cache, + warmup_runs=args.warmup_runs, + timing_runs=args.timing_runs, + ) + + logger.info("Autotuning Mode: Pattern-Based") + + try: + node_filter_list = None + if args.node_filter_list: + filter_file = validate_file_path(args.node_filter_list, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() + for line in f + if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + + region_pattern_autotuning_workflow( + model_path=str(model_path), + output_dir=output_dir, + num_schemes_per_region=args.num_schemes, + pattern_cache_file=args.pattern_cache_file, + state_file=args.state_file, + quant_type=args.quant_type, + default_dq_dtype=args.default_dq_dtype, + qdq_baseline_model=args.qdq_baseline, + node_filter_list=node_filter_list, + ) + + logger.info("\n" + "=" * 70) + logger.info("✓ Autotuning completed successfully!") + logger.info(f"✓ Results: {output_dir}") + logger.info("=" * 70) + return 0 + + except KeyboardInterrupt: + logger.warning("\nInterrupted by user") + state_file = args.state_file or output_dir / "autotuner_state.yaml" + logger.info(f"Progress saved to: {state_file}") + return 130 + + except Exception as e: + 
logger.error(f"\nAutotuning failed: {e}", exc_info=args.verbose) + return 1 + + +def _get_autotune_parser() -> argparse.ArgumentParser: + """Create and configure the command-line argument parser.""" + parser = argparse.ArgumentParser( + prog="modelopt.onnx.quantization.autotune", + description="ONNX Q/DQ Autotuning with TensorRT", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx + + # Import patterns from QDQ baseline model + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --qdq_baseline baseline.onnx + + # Use pattern cache for warm-start + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --pattern_cache cache.yaml + + # Full example with all options + python -m modelopt.onnx.quantization.autotune \\ + --onnx_path model.onnx --schemes_per_region 50 \\ + --pattern_cache cache.yaml --qdq_baseline baseline.onnx \\ + --quant_type int8 --verbose + """, + ) + + # Model and Output + io_group = parser.add_argument_group("Model and Output") + io_group.add_argument( + "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file" + ) + io_group.add_argument( + "--output", + "-o", + type=str, + default=DEFAULT_OUTPUT_DIR, + help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})", + ) + + # Autotuning Strategy + strategy_group = parser.add_argument_group("Autotuning Strategy") + strategy_group.add_argument( + "--schemes_per_region", + "-s", + type=int, + default=DEFAULT_NUM_SCHEMES, + dest="num_schemes", + help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})", + ) + strategy_group.add_argument( + "--pattern_cache", + type=str, + default=None, + dest="pattern_cache_file", + help="Path to pattern cache YAML for warm-start (optional)", + ) + strategy_group.add_argument( + "--qdq_baseline", + type=str, + default=None, + help="Path to QDQ baseline ONNX model to import 
quantization patterns (optional)", + ) + strategy_group.add_argument( + "--state_file", + type=str, + default=None, + help="State file path for resume capability (default: /autotuner_state.yaml)", + ) + strategy_group.add_argument( + "--node_filter_list", + type=str, + default=None, + help="Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " + "Regions without any matching nodes are skipped during autotuning.", + ) + + # Quantization + quant_group = parser.add_argument_group("Quantization") + quant_group.add_argument( + "--quant_type", + type=str, + default=DEFAULT_QUANT_TYPE, + choices=["int8", "fp8"], + help=f"Quantization data type (default: {DEFAULT_QUANT_TYPE})", + ) + quant_group.add_argument( + "--default_dq_dtype", + type=str, + default=DEFAULT_DQ_DTYPE, + choices=["float16", "float32", "bfloat16"], + help="Default DQ output dtype if cannot be deduced (optional)", + ) + + # TensorRT Benchmark + trt_group = parser.add_argument_group("TensorRT Benchmark") + trt_group.add_argument( + "--use_trtexec", + action="store_true", + help="Use trtexec for benchmarking (default: False)", + default=False, + ) + trt_group.add_argument( + "--timing_cache", + type=str, + default=DEFAULT_TIMING_CACHE, + help=f"TensorRT timing cache file (default: {DEFAULT_TIMING_CACHE})", + ) + trt_group.add_argument( + "--warmup_runs", + type=int, + default=DEFAULT_WARMUP_RUNS, + help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})", + ) + trt_group.add_argument( + "--timing_runs", + type=int, + default=DEFAULT_TIMING_RUNS, + help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})", + ) + trt_group.add_argument( + "--plugin_libraries", + "--plugins", + type=str, + nargs="+", + default=None, + dest="plugin_libraries", + help="TensorRT plugin libraries (.so files) to load (optional, space-separated)", + ) + + # Logging + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose DEBUG logging") + + return parser From 
56a67e3175778e8a49db26ddd33e4161dc5efe05 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 26 Jan 2026 10:21:21 -0500 Subject: [PATCH 02/42] Add more tests Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/test_config.py | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 tests/unit/onnx/quantization/autotune/test_config.py diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py new file mode 100644 index 000000000..db6b02aa3 --- /dev/null +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for the Config class in the autotuner. + +Tests configuration parameter validation and defaults. 
+""" + +import os +import sys +import unittest + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from modelopt.onnx.quantization.autotune.common import Config + + +class TestConfig(unittest.TestCase): + """Test Config class functionality.""" + + def test_default_values(self): + """Test that Config has correct default values.""" + config = Config() + + # Logging + assert not config.verbose + + # Performance thresholds + + # Q/DQ defaults + assert config.default_q_scale == 0.1 + assert config.default_q_zero_point == 0 + assert config.default_quant_type == "int8" + + # Region builder settings + assert config.maximum_sequence_region_size == 10 + assert config.minimum_topdown_search_size == 10 + + # Scheme generation parameters + assert config.top_percent_to_mutate == 0.1 + assert config.minimum_schemes_to_mutate == 10 + assert config.maximum_mutations == 3 + assert config.maximum_generation_attempts == 100 + + # Pattern cache parameters + assert config.pattern_cache_minimum_distance == 4 + assert config.pattern_cache_max_entries_per_pattern == 32 + + print("✓ Config default values are correct") + + def test_custom_values(self): + """Test creating Config with custom values.""" + config = Config( + verbose=True, + default_q_scale=0.05, + default_q_zero_point=128, + default_quant_type="fp8", + maximum_sequence_region_size=20, + ) + + assert config.verbose + assert config.default_q_scale == 0.05 + assert config.default_q_zero_point == 128 + assert config.default_quant_type == "fp8" + assert config.maximum_sequence_region_size == 20 + print("✓ Config custom values work correctly") + + def test_region_size_validation(self): + """Test that region size parameters are positive.""" + config = Config(maximum_sequence_region_size=50, minimum_topdown_search_size=5) + assert config.maximum_sequence_region_size > 0 + assert config.minimum_topdown_search_size > 0 + print("✓ Config region size validation") + + def 
test_genetic_algorithm_params(self): + """Test genetic algorithm parameters.""" + config = Config( + top_percent_to_mutate=0.2, + minimum_schemes_to_mutate=2, + maximum_mutations=5, + maximum_generation_attempts=50, + ) + + assert config.top_percent_to_mutate == 0.2 + assert config.minimum_schemes_to_mutate == 2 + assert config.maximum_mutations == 5 + assert config.maximum_generation_attempts == 50 + print("✓ Config genetic algorithm parameters") + + def test_pattern_cache_params(self): + """Test pattern cache parameters.""" + config = Config(pattern_cache_minimum_distance=3, pattern_cache_max_entries_per_pattern=10) + + assert config.pattern_cache_minimum_distance == 3 + assert config.pattern_cache_max_entries_per_pattern == 10 + print("✓ Config pattern cache parameters") + + +def run_tests(): + """Run all Config tests.""" + print("=" * 70) + print("Config Class Test Suite") + print("=" * 70) + + loader = unittest.TestLoader() + suite = unittest.TestSuite() + suite.addTests(loader.loadTestsFromTestCase(TestConfig)) + + runner = unittest.TextTestRunner(verbosity=2) + result = runner.run(suite) + + print("\n" + "=" * 70) + print("Test Summary") + print("=" * 70) + print(f"Tests run: {result.testsRun}") + print(f"Successes: {result.testsRun - len(result.failures) - len(result.errors)}") + print(f"Failures: {len(result.failures)}") + print(f"Errors: {len(result.errors)}") + + if result.wasSuccessful(): + print("\n✓ All Config tests passed!") + return 0 + else: + print("\n✗ Some tests failed") + return 1 + + +if __name__ == "__main__": + sys.exit(run_tests()) From 6d58b4a5482a590d1f079ee2e112b3208765f57c Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 26 Jan 2026 11:42:52 -0500 Subject: [PATCH 03/42] Refactor: PR #702 Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git 
a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index d3b3de272..0c56a608a 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -531,12 +531,19 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: else: # Existing scheme is better, skip new one too_similar = True + if scheme.latency_ms < existing_scheme.latency_ms: + # New scheme is better, mark existing for replacement + schemes_to_replace.append(existing_scheme) break if existing_to_remove is not None: filtered_schemes.remove(existing_to_remove) if not too_similar: filtered_schemes.append(scheme) + elif schemes_to_replace: + for scheme_to_replace in schemes_to_replace: + filtered_schemes.remove(scheme_to_replace) + filtered_schemes.append(scheme) sorted_schemes = filtered_schemes From 710319ad58e15176bfb84641f54ecc6676fecaf5 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 26 Jan 2026 11:55:15 -0500 Subject: [PATCH 04/42] Remove python path in tests Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- tests/unit/onnx/quantization/autotune/test_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py index db6b02aa3..c5b20a8a9 100644 --- a/tests/unit/onnx/quantization/autotune/test_config.py +++ b/tests/unit/onnx/quantization/autotune/test_config.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # From 98a60b55c87dd91678189ea23325397cb0ba7dfb Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 27 Jan 2026 12:00:16 -0500 Subject: [PATCH 05/42] Recover docstrings and simplify code (->, , ) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/common.py | 1 + tests/unit/onnx/quantization/autotune/test_region.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index 0c56a608a..fe22e19c0 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -524,6 +524,7 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: distance = scheme.distance(existing_scheme) if distance < self.minimum_distance: # Schemes are too similar, keep the better one + too_similar = True if scheme.latency_ms < existing_scheme.latency_ms: # New scheme is better; mark existing for removal existing_to_remove = existing_scheme diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index 5a733017d..3bbf34ac9 100644 --- a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -59,7 +59,6 @@ def test_parent_child_relationship(parent_with_children): assert parent.get_children() == [child1, child2] assert child1.parent == child2.parent == parent - def test_add_and_get_nodes(leaf): leaf.nodes.update([0, 1, 2]) assert set(leaf.get_nodes()) == {0, 1, 2} @@ -79,7 +78,6 @@ def test_region_size_recursive(parent_with_children): parent.nodes.add(5) assert len(parent.get_region_nodes_and_descendants()) == 6 - def test_metadata(leaf): leaf.metadata.update({"pattern": "Conv->Relu", "quantizable": "true"}) assert leaf.metadata == {"pattern": "Conv->Relu", 
"quantizable": "true"} @@ -109,4 +107,4 @@ def test_remove_child(): parent.add_child(child) parent.remove_child(child) assert parent.get_children() == [] - assert child.parent is None + assert child.parent is None \ No newline at end of file From 91cef9c02288e27b5756621d608aaf012cd88e1a Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:03:02 -0500 Subject: [PATCH 06/42] Added unittest for workflows.py (failing) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/autotune/test_workflows.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 tests/unit/onnx/quantization/autotune/test_workflows.py diff --git a/tests/unit/onnx/quantization/autotune/test_workflows.py b/tests/unit/onnx/quantization/autotune/test_workflows.py new file mode 100644 index 000000000..40a323dce --- /dev/null +++ b/tests/unit/onnx/quantization/autotune/test_workflows.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import tempfile +import unittest +from pathlib import Path + +import onnx + +# Add parent directory to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from unit.onnx.quantization.autotune.test_autotuner import create_simple_conv_model + +from modelopt.onnx.quantization.autotune.workflows import region_pattern_autotuning_workflow + + +class TestWorkflows(unittest.TestCase): + """Test workflows functionality.""" + + def test_export_quantized_model(self): + """Test exporting quantized model with Q/DQ.""" + model = create_simple_conv_model() + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + baseline_model_path = f.name + + # Save baseline model + onnx.save(model, baseline_model_path) + + output_dir = baseline_model_path.strip(".onnx") + output_path = output_dir + ".quant.onnx" + + try: + autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) + + # Export model with Q/DQ insertion + autotuner.export_onnx(output_path, insert_qdq=True) + + # Verify file was created + assert os.path.exists(output_path) + + # Verify it's a valid ONNX model + exported_model = onnx.load(output_path) + assert exported_model is not None + + # Verify that it contains Q/DQ nodes + qdq_nodes = [ + n + for n in exported_model.graph.node + if n.op_type in ["QuantizeLinear", "DequantizeLinear"] + ] + assert qdq_nodes, "Q/DQ nodes not found in quantized model" + + print("✓ QDQAutotuner export quantized model") + finally: + if os.path.exists(output_path): + os.unlink(output_path) From 7937cc25fd806277905d856b9e80082a96e1754b Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:06:37 -0500 Subject: [PATCH 07/42] Fix: 'Autotuning failed: 'PatternSchemes' object has no attribute 'node_inputs'' Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/region_pattern.py | 5 ++++- 1 
file changed, 4 insertions(+), 1 deletion(-) diff --git a/modelopt/onnx/quantization/autotune/region_pattern.py b/modelopt/onnx/quantization/autotune/region_pattern.py index a32273f84..9f80bd56e 100644 --- a/modelopt/onnx/quantization/autotune/region_pattern.py +++ b/modelopt/onnx/quantization/autotune/region_pattern.py @@ -21,7 +21,7 @@ import onnx_graphsurgeon as gs from modelopt.onnx.op_types import get_symmetric_ops -from modelopt.onnx.quantization.autotune.common import InsertionScheme, Region +from modelopt.onnx.quantization.autotune.common import InsertionScheme, PatternSchemes, Region from modelopt.onnx.quantization.autotune.insertion_points import ( ChildRegionInputInsertionPoint, ChildRegionOutputInsertionPoint, @@ -161,6 +161,9 @@ def matches( is provided but other is not a Region TypeError: If other is neither RegionPattern nor Region """ + if isinstance(scheme, PatternSchemes): + return set() + if isinstance(other, RegionPattern): if scheme is not None: raise ValueError("scheme parameter can only be used when matching against a Region") From 7c4e14b3912b90f25b2fb7d7bbfc44c78c14d9eb Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:38:48 -0500 Subject: [PATCH 08/42] Updated workflow test to test TRT and PythonTRT benchmarking Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/autotune/test_workflows.py | 74 ++++++++++--------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/tests/unit/onnx/quantization/autotune/test_workflows.py b/tests/unit/onnx/quantization/autotune/test_workflows.py index 40a323dce..c81794e5b 100644 --- a/tests/unit/onnx/quantization/autotune/test_workflows.py +++ b/tests/unit/onnx/quantization/autotune/test_workflows.py @@ -16,57 +16,65 @@ import os import sys import tempfile -import unittest from pathlib import Path import onnx +import pytest # Add parent directory to path sys.path.insert(0, 
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from _test_utils.import_helper import skip_if_no_tensorrt, skip_if_no_trtexec from unit.onnx.quantization.autotune.test_autotuner import create_simple_conv_model -from modelopt.onnx.quantization.autotune.workflows import region_pattern_autotuning_workflow +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) -class TestWorkflows(unittest.TestCase): - """Test workflows functionality.""" +@pytest.mark.parametrize("use_trtexec", [True, False]) +def test_export_quantized_model(use_trtexec): + """Test exporting quantized model with Q/DQ.""" + if use_trtexec: + skip_if_no_trtexec() + else: + skip_if_no_tensorrt() - def test_export_quantized_model(self): - """Test exporting quantized model with Q/DQ.""" - model = create_simple_conv_model() + model = create_simple_conv_model() - with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: - baseline_model_path = f.name + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + baseline_model_path = f.name - # Save baseline model - onnx.save(model, baseline_model_path) + # Save baseline model + onnx.save(model, baseline_model_path) - output_dir = baseline_model_path.strip(".onnx") - output_path = output_dir + ".quant.onnx" + output_dir = baseline_model_path.strip(".onnx") + output_path = output_dir + ".quant.onnx" - try: - autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) + try: + init_benchmark_instance(use_trtexec=False) + autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) - # Export model with Q/DQ insertion - autotuner.export_onnx(output_path, insert_qdq=True) + # Export model with Q/DQ insertion + autotuner.export_onnx(output_path, insert_qdq=True) - # Verify file was created - assert os.path.exists(output_path) + # Verify file was created + assert os.path.exists(output_path) - # Verify it's a 
valid ONNX model - exported_model = onnx.load(output_path) - assert exported_model is not None + # Verify it's a valid ONNX model + exported_model = onnx.load(output_path) + assert exported_model is not None - # Verify that it contains Q/DQ nodes - qdq_nodes = [ - n - for n in exported_model.graph.node - if n.op_type in ["QuantizeLinear", "DequantizeLinear"] - ] - assert qdq_nodes, "Q/DQ nodes not found in quantized model" + # Verify that it contains Q/DQ nodes + qdq_nodes = [ + n + for n in exported_model.graph.node + if n.op_type in ["QuantizeLinear", "DequantizeLinear"] + ] + assert qdq_nodes, "Q/DQ nodes not found in quantized model" - print("✓ QDQAutotuner export quantized model") - finally: - if os.path.exists(output_path): - os.unlink(output_path) + print("✓ QDQAutotuner export quantized model") + finally: + if os.path.exists(output_path): + os.unlink(output_path) From 64836edf261d5da68152aacd1397dad33854bdfc Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:30:34 -0500 Subject: [PATCH 09/42] Fix test: use_trtexec flag Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- tests/unit/onnx/quantization/autotune/test_workflows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/onnx/quantization/autotune/test_workflows.py b/tests/unit/onnx/quantization/autotune/test_workflows.py index c81794e5b..c8edafc06 100644 --- a/tests/unit/onnx/quantization/autotune/test_workflows.py +++ b/tests/unit/onnx/quantization/autotune/test_workflows.py @@ -53,7 +53,7 @@ def test_export_quantized_model(use_trtexec): output_path = output_dir + ".quant.onnx" try: - init_benchmark_instance(use_trtexec=False) + init_benchmark_instance(use_trtexec=use_trtexec) autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) # Export model with Q/DQ insertion From a9af36afbf2e4e542e7a02a376ee7beadcd6e806 Mon Sep 17 00:00:00 2001 From: gcunhase 
<4861122+gcunhase@users.noreply.github.com> Date: Thu, 19 Feb 2026 12:26:49 -0500 Subject: [PATCH 10/42] Add real scales to Q/DQ nodes Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/__main__.py | 8 ++ .../onnx/quantization/autotune/workflows.py | 26 ++++- modelopt/onnx/quantization/fp8.py | 24 +++-- modelopt/onnx/quantization/int8.py | 37 ++++--- modelopt/onnx/quantization/ort_utils.py | 56 +++++----- modelopt/onnx/quantization/quantize.py | 102 +++++++++++++++--- modelopt/onnx/utils.py | 18 ++++ 7 files changed, 201 insertions(+), 70 deletions(-) diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index 6c79d9317..433980249 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -295,6 +295,13 @@ def get_parser() -> argparse.ArgumentParser: "if certain operations require a higher version." ), ) + argparser.add_argument( + "--autotune", + action="store_true", + help=( + "If set, detect optimal Q/DQ node placements according to the TensorRT version and platform available." 
+ ), + ) return argparser @@ -362,6 +369,7 @@ def main(): calibrate_per_node=args.calibrate_per_node, direct_io_types=args.direct_io_types, opset=args.opset, + autotune=args.autotune, ) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index 025d9fac4..57ad73015 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -20,6 +20,8 @@ """ import fnmatch +import shutil +import tempfile from pathlib import Path import onnx @@ -158,8 +160,8 @@ def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool: def region_pattern_autotuning_workflow( - model_path: str, - output_dir: Path, + model_path: str | onnx.ModelProto, + output_dir: Path | None = None, num_schemes_per_region: int = 30, pattern_cache_file: str | None = None, state_file: str | None = None, @@ -168,6 +170,7 @@ def region_pattern_autotuning_workflow( qdq_baseline_model: str | None = None, node_filter_list: list[str] | None = None, verbose: bool = False, + keep_output_dir: bool = False, ) -> QDQAutotuner: """Run automated Q/DQ (Quantization/Dequantization) optimization on an ONNX model. @@ -196,7 +199,7 @@ def region_pattern_autotuning_workflow( Args: model_path: Path to ONNX model file to optimize - output_dir: Directory for output files (state, logs, models). Created if doesn't exist. + output_dir: Directory for output files (state, logs, models). Created if it doesn't exist. num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern. Higher values explore more configurations but take longer (default: 30) pattern_cache_file: Optional path to pattern cache YAML file containing known-good schemes @@ -211,10 +214,14 @@ def region_pattern_autotuning_workflow( node_filter_list: Optional list of wildcard patterns to filter ONNX nodes. 
Regions without any matching nodes are skipped during autotuning (default: None) verbose: Enable verbose logging in Config for detailed autotuner output (default: False) + keep_output_dir: If True, keep output_dir; otherwise, remove it at the end of this function. Returns: QDQAutotuner instance after autotuning """ + if not output_dir: + output_dir = Path(tempfile.mkdtemp()) + output_dir.mkdir(parents=True, exist_ok=True) logs_dir = output_dir / "logs" logs_dir.mkdir(exist_ok=True) @@ -225,8 +232,11 @@ def region_pattern_autotuning_workflow( state_file = str(output_dir / "autotuner_state.yaml") state_path = Path(state_file) - logger.info(f"Loading model: {model_path}") - model = onnx.load(model_path) + if isinstance(model_path, str): + logger.info(f"Loading model: {model_path}") + model = onnx.load(model_path) + else: + model = model_path pattern_cache = None if pattern_cache_file: @@ -373,4 +383,10 @@ def region_pattern_autotuning_workflow( logger.debug(f" Logs: {logs_dir}") logger.debug(f" Region models: {models_dir}") + if not keep_output_dir: + logger.debug( + f"Removing output dir: {output_dir}. Set 'keep_output_dir=True' if you wish to keep it." + ) + shutil.rmtree(output_dir) + return autotuner diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py index 76a3e8167..d8e0349ca 100755 --- a/modelopt/onnx/quantization/fp8.py +++ b/modelopt/onnx/quantization/fp8.py @@ -183,6 +183,8 @@ def quantize( custom_ops_to_quantize: list[str] = [], direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, + no_quantize_inputs: list[tuple[gs.Node, gs.Node, str]] | None = None, **kwargs, ) -> onnx.ModelProto: """Applies FP8 GEMM only quantization to an ONNX file. 
@@ -215,7 +217,7 @@ def quantize( op_types_to_quantize.extend(list(custom_ops_to_quantize)) enable_gemv_detection_for_trt = kwargs.get("enable_gemv_detection_for_trt", True) - if enable_gemv_detection_for_trt: + if enable_gemv_detection_for_trt and not autotune: # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. @@ -233,7 +235,8 @@ def quantize( # Collect node names to exclude from quantization nodes_to_exclude = find_nodes_to_exclude(graph, nodes_to_exclude, op_types_to_exclude) # type: ignore[arg-type] - nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")) + if not autotune: + nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="fp8")) # Change the default configuration of ORT quantization op_types = {node.op for node in graph.nodes} @@ -244,19 +247,22 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, + autotune, ) logger.info( f"Quantizable op types in the model: {[t for t in op_types_to_quantize if t in op_types]}" ) # Collect node names to include in quantization - no_quantize_inputs = [] - nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) - if not nodes_to_quantize: - quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( - graph, quantizable_op_types, nodes_to_exclude - ) - nodes_to_quantize = [node.name for node in quantizable_nodes] + nodes_to_quantize = nodes_to_quantize or [] + no_quantize_inputs = no_quantize_inputs or [] + if not autotune: + nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) + if not nodes_to_quantize: + quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( + graph, quantizable_op_types, nodes_to_exclude + ) + nodes_to_quantize = [node.name for node in quantizable_nodes] # Update the list of nodes to quantize nodes_to_quantize = [ diff 
--git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py index 6e350a16f..b1f781fb5 100755 --- a/modelopt/onnx/quantization/int8.py +++ b/modelopt/onnx/quantization/int8.py @@ -133,6 +133,8 @@ def quantize( custom_ops_to_quantize: list[str] = [], direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, + no_quantize_inputs: list[tuple[Node, Node, str]] | None = None, **kwargs, ) -> onnx.ModelProto: """Applies INT8 quantization to an ONNX file using the compiler friendly heuristics. @@ -157,7 +159,7 @@ def quantize( return onnx_model enable_gemv_detection_for_trt = kwargs.get("enable_gemv_detection_for_trt", True) - if enable_gemv_detection_for_trt: + if enable_gemv_detection_for_trt and not autotune: # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. @@ -175,7 +177,8 @@ def quantize( # Collect node names to exclude from quantization nodes_to_exclude = find_nodes_to_exclude(graph, nodes_to_exclude, op_types_to_exclude) # type: ignore[arg-type] - nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")) + if not autotune: + nodes_to_exclude.extend(find_nodes_from_convs_to_exclude(graph, quantize_mode="int8")) # Change the default configuration of ORT quantization op_types_to_quantize = op_types_to_quantize or [] @@ -189,22 +192,27 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, + autotune, ) logger.info(f"Quantizable op types: {[t for t in quantizable_op_types if t in op_types]}") # Collect node names to include in quantization - no_quantize_inputs = [] - nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) - if not nodes_to_quantize: - # If nodes_to_quantize is not passed, use user supplied op_types_to_quantize list - nodes_to_quantize = [node.name for node in graph.nodes if node.op in 
op_types_to_quantize] - - # If op_types_to_quantize is not provided, use default QDQ placement algorithm + nodes_to_quantize = nodes_to_quantize or [] + no_quantize_inputs = no_quantize_inputs or [] + if not autotune: + nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) if not nodes_to_quantize: - quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( - graph, quantizable_op_types, nodes_to_exclude - ) - nodes_to_quantize = [node.name for node in quantizable_nodes] + # If nodes_to_quantize is not passed, use user supplied op_types_to_quantize list + nodes_to_quantize = [ + node.name for node in graph.nodes if node.op in op_types_to_quantize + ] + + # If op_types_to_quantize is not provided, use default QDQ placement algorithm + if not nodes_to_quantize: + quantizable_nodes, no_quantize_inputs = _find_nodes_to_quantize( + graph, quantizable_op_types, nodes_to_exclude + ) + nodes_to_quantize = [node.name for node in quantizable_nodes] # Read the calibration cache and quantize nodes for which activation scale values are cached if calibration_cache_path: @@ -220,7 +228,8 @@ def quantize( logger.info( f"Skipping quantization of nodes: {set(nodes_to_quantize) - set(iq_quantized_nodes)}" ) - nodes_to_quantize = list(set(nodes_to_quantize).intersection(iq_quantized_nodes)) + if not autotune: + nodes_to_quantize = list(set(nodes_to_quantize).intersection(iq_quantized_nodes)) # Update the list of nodes to quantize nodes_to_quantize = [ diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 5c89e20d7..089c8850f 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -271,6 +271,7 @@ def configure_ort( calibration_eps: list[str] | None = None, calibrate_per_node: bool = False, custom_ops_to_quantize: list[str] = [], + autotune: bool = False, ): """Configure and patches ORT to support ModelOpt ONNX quantization.""" logger.info("Configuring ORT for 
ModelOpt ONNX quantization") @@ -289,33 +290,34 @@ def configure_ort( # Patch ORT modules to fix bugs and support some edge cases patch_ort_modules(calibrate_per_node) - # Remove copy, reduction and activation ops from ORT QDQ registry - logger.debug("Removing non-quantizable ops from QDQ registry") - for op_type in [ - "ArgMax", - "Concat", - "EmbedLayerNormalization", - "Gather", - "GatherElements", - "GatherND", - "InstanceNormalization", - "LeakyRelu", - "Pad", - "Relu", - "Reshape", - "Slice", - "Sigmoid", - "Softmax", - "Split", - "Squeeze", - "Transpose", - "Unsqueeze", - "Where", - ]: - if op_type in QLinearOpsRegistry: - del QLinearOpsRegistry[op_type] - if op_type in QDQRegistry: - del QDQRegistry[op_type] + if not autotune: + # Remove copy, reduction and activation ops from ORT QDQ registry + logger.debug("Removing non-quantizable ops from QDQ registry") + for op_type in [ + "ArgMax", + "Concat", + "EmbedLayerNormalization", + "Gather", + "GatherElements", + "GatherND", + "InstanceNormalization", + "LeakyRelu", + "Pad", + "Relu", + "Reshape", + "Slice", + "Sigmoid", + "Softmax", + "Split", + "Squeeze", + "Transpose", + "Unsqueeze", + "Where", + ]: + if op_type in QLinearOpsRegistry: + del QLinearOpsRegistry[op_type] + if op_type in QDQRegistry: + del QDQRegistry[op_type] # Prepare TensorRT friendly quantization settings no_output_quantization_op_types = [ diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index da7ff126d..a4f631f9f 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -73,6 +73,7 @@ QDQ_PRECISION_MIN_OPSET, duplicate_shared_constants, get_opset_version, + get_quantized_nodes, name_onnx_nodes, save_onnx, ) @@ -242,6 +243,52 @@ def _preprocess_onnx( ) +def _find_nodes_to_quantize_autotune( + onnx_path: str, + onnx_model: onnx.ModelProto, + quantize_mode: str, + trt_plugins: list[str], + high_precision_dtype: str = "fp16", +) -> tuple[list[str], 
list[str], list[tuple[gs.Node, gs.Node, str]]]: + logger.info("Running Auto Q/DQ with TensorRT") + from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops + from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, + ) + + # Initialize Autotuner with the Python 'tensorrt' package + init_benchmark_instance(use_trtexec=False, plugin_libraries=trt_plugins) + precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} + autotuner = region_pattern_autotuning_workflow( + onnx_model, + quant_type=quantize_mode, + default_dq_dtype=precision_map[high_precision_dtype], + ) + + # Export model with Q/DQ insertion + onnx_path_autotune = onnx_path.replace(".onnx", ".quant_autotune.onnx") + onnx_bytes = autotuner.export_onnx(onnx_path_autotune, insert_qdq=True, best=True) + # intermediate_generated_files.append(onnx_path_autotune) + + # Get nodes and op types to quantize + onnx_model_autotune = onnx.load_from_string(onnx_bytes) + nodes_to_quantize_autotune = get_quantized_nodes(onnx_model_autotune) + nodes_to_quantize_autotune_names = [n.name for n in nodes_to_quantize_autotune] + op_types_to_quantize = list(get_autotuner_quantizable_ops()) + + # Get non-quantizable tensors + # List of non-quantizable tensors in the form of (src_node, dst_node, tensor_name) + no_quantize_inputs = [] + for node in nodes_to_quantize_autotune: + for idx, inp in enumerate(node.inputs): + if inp.inputs and inp.inputs[0].op != "DequantizeLinear": + src_node = node.i(idx) + no_quantize_inputs.append((src_node, node, inp.name)) + + return nodes_to_quantize_autotune_names, op_types_to_quantize, no_quantize_inputs + + def quantize( onnx_path: str, quantize_mode: str = "int8", @@ -275,6 +322,7 @@ def quantize( input_shapes_profile: Sequence[dict[str, str]] | None = None, direct_io_types: bool = False, opset: int | None = None, + autotune: bool = False, **kwargs: Any, ) -> None: 
"""Quantizes the provided ONNX model. @@ -398,6 +446,9 @@ def quantize( Target ONNX opset version for the quantized model. If None, uses required minimum opset (19 for int8/fp8, 21 for int4, 23 for nvfp4). If the specified opset is lower than the required minimum, a warning will be issued and the opset will be upgraded to the required minimum. + autotune: + If True, detect optimal Q/DQ node placements according to the TensorRT version and platform available. + If False, use the default pattern-based quantization approach. kwargs: Additional keyword arguments for int4 quantization, including: - awqlite_alpha_step (float): Alpha step for lite, range [0, 1]. @@ -486,26 +537,40 @@ def quantize( # Check op types spelling in 'op_types_to_exclude' and '_to_quantize' validate_op_types_spelling(onnx_path, op_types_to_quantize, op_types_to_exclude) - # (1) If disable_mha_qdq is set, don't add Q/DQ layers to MatMuls in MHA pattern. - # (2) else when quantize_mode == "int8", if seq_len > 512, don't add Q/DQ layers to - # MatMuls in MHA pattern. - # (3) else when quantize_mode == "fp8", if head_size > 256 or head_size <= 8 - # or mha doesn't meet fp8 fMHA v2 pattern, don't add Q/DQ layers to MatMuls in MHA pattern. - nodes_to_exclude = find_nodes_from_mha_to_exclude( - onnx_path, - use_external_data_format, - nodes_to_exclude, - disable_mha_qdq, - quantize_mode, - intermediate_generated_files, - calibration_data_reader, - calibration_eps, - ) + if not autotune: + # (1) If disable_mha_qdq is set, don't add Q/DQ layers to MatMuls in MHA pattern. + # (2) else when quantize_mode == "int8", if seq_len > 512, don't add Q/DQ layers to + # MatMuls in MHA pattern. + # (3) else when quantize_mode == "fp8", if head_size > 256 or head_size <= 8 + # or mha doesn't meet fp8 fMHA v2 pattern, don't add Q/DQ layers to MatMuls in MHA pattern. 
+ nodes_to_exclude = find_nodes_from_mha_to_exclude( + onnx_path, + use_external_data_format, + nodes_to_exclude, + disable_mha_qdq, + quantize_mode, + intermediate_generated_files, + calibration_data_reader, + calibration_eps, + ) if calibrate_per_node and not calibration_shapes: calibration_shapes = get_input_shapes(onnx_path) if quantize_mode in ["fp8", "int8"]: + no_quantize_inputs = [] + if autotune: + nodes_to_quantize_autotune, op_types_to_quantize, no_quantize_inputs = ( + _find_nodes_to_quantize_autotune( + onnx_path, + onnx_model, + quantize_mode, + trt_plugins, + high_precision_dtype, + ) + ) + nodes_to_quantize.extend(nodes_to_quantize_autotune) + quantize_func = quantize_int8 if quantize_mode == "int8" else quantize_fp8 onnx_model = quantize_func( onnx_path=onnx_path, @@ -531,8 +596,15 @@ def quantize( custom_ops_to_quantize=list(custom_ops_to_quantize.keys()), direct_io_types=direct_io_types, opset=opset, + autotune=autotune, + no_quantize_inputs=no_quantize_inputs, **kwargs, ) + + # if autotune: + # # Copy real scales to quantized model + # print() + elif "int4" in quantize_mode: onnx_model = quantize_int4( onnx_path=onnx_path, diff --git a/modelopt/onnx/utils.py b/modelopt/onnx/utils.py index 4025ea065..168af663b 100644 --- a/modelopt/onnx/utils.py +++ b/modelopt/onnx/utils.py @@ -172,6 +172,24 @@ def get_dynamic_graph_inputs(onnx_model: onnx.ModelProto): return [inp for inp in graph.inputs if any(isinstance(s, str) or s <= 0 for s in inp.shape)] +def get_quantized_nodes(onnx_model: onnx.ModelProto) -> list: + """This function returns the nodes preceded by a DQ node. + + Args: + onnx_model: ONNX model to traverse. + + Returns: + List of quantized nodes. 
+ """ + graph = gs.import_onnx(onnx_model) + + return [ + node + for node in graph.nodes + if any(inp.inputs[0].op == "DequantizeLinear" for inp in node.inputs if inp.inputs) + ] + + def _get_all_shapes(container: Any) -> dict[str, list[int]]: """This method returns the shape of tensors within a RepeatedCompositeContainer. From 29e8dd20dcde0f5503c7ec6e4425b08a2d7c1d8d Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 23 Feb 2026 20:26:34 -0500 Subject: [PATCH 11/42] fix precommit failures Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/common.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index fe22e19c0..8717685c7 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -524,7 +524,6 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: distance = scheme.distance(existing_scheme) if distance < self.minimum_distance: # Schemes are too similar, keep the better one - too_similar = True if scheme.latency_ms < existing_scheme.latency_ms: # New scheme is better; mark existing for removal existing_to_remove = existing_scheme @@ -541,10 +540,6 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: filtered_schemes.remove(existing_to_remove) if not too_similar: filtered_schemes.append(scheme) - elif schemes_to_replace: - for scheme_to_replace in schemes_to_replace: - filtered_schemes.remove(scheme_to_replace) - filtered_schemes.append(scheme) sorted_schemes = filtered_schemes From 7f69882cc9f7cb3cbd2118f7406d900f585c2672 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 23 Feb 2026 20:28:00 -0500 Subject: [PATCH 12/42] Fix: Add->Q/DQ->Activation(Relu) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- 
modelopt/onnx/op_types.py | 22 +++++++++ modelopt/onnx/quantization/fp8.py | 5 +- modelopt/onnx/quantization/int8.py | 5 +- modelopt/onnx/quantization/ort_utils.py | 62 +++++++++++++------------ modelopt/onnx/quantization/quantize.py | 62 ++++++++++++++++++------- modelopt/onnx/utils.py | 5 +- 6 files changed, 105 insertions(+), 56 deletions(-) diff --git a/modelopt/onnx/op_types.py b/modelopt/onnx/op_types.py index 7e11d25e6..42085e18f 100644 --- a/modelopt/onnx/op_types.py +++ b/modelopt/onnx/op_types.py @@ -386,3 +386,25 @@ def get_symmetric_ops(): "BitwiseOr", "BitwiseXor", } + + +def get_activation_ops(): + """Returns set of activation operations.""" + return { + "Relu", + "LeakyRelu", + "PRelu", + "Elu", + "Selu", + "ThresholdedRelu", + "Sigmoid", + "Tanh", + "HardSigmoid", + "Softmax", + "LogSoftmax", + "Clip", + "Softplus", + "Softsign", + "Swish", + "HardSwish", + } diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py index d8e0349ca..e181e1864 100755 --- a/modelopt/onnx/quantization/fp8.py +++ b/modelopt/onnx/quantization/fp8.py @@ -184,7 +184,6 @@ def quantize( direct_io_types: bool = False, opset: int | None = None, autotune: bool = False, - no_quantize_inputs: list[tuple[gs.Node, gs.Node, str]] | None = None, **kwargs, ) -> onnx.ModelProto: """Applies FP8 GEMM only quantization to an ONNX file. 
@@ -247,7 +246,7 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, - autotune, + kwargs.get("op_types_needing_output_quant"), ) logger.info( f"Quantizable op types in the model: {[t for t in op_types_to_quantize if t in op_types]}" @@ -255,7 +254,7 @@ def quantize( # Collect node names to include in quantization nodes_to_quantize = nodes_to_quantize or [] - no_quantize_inputs = no_quantize_inputs or [] + no_quantize_inputs = kwargs.get("no_quantize_inputs", []) if not autotune: nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) if not nodes_to_quantize: diff --git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py index b1f781fb5..27c87abd4 100755 --- a/modelopt/onnx/quantization/int8.py +++ b/modelopt/onnx/quantization/int8.py @@ -134,7 +134,6 @@ def quantize( direct_io_types: bool = False, opset: int | None = None, autotune: bool = False, - no_quantize_inputs: list[tuple[Node, Node, str]] | None = None, **kwargs, ) -> onnx.ModelProto: """Applies INT8 quantization to an ONNX file using the compiler friendly heuristics. 
@@ -192,13 +191,13 @@ def quantize( calibration_eps, calibrate_per_node, custom_ops_to_quantize, - autotune, + kwargs.get("op_types_needing_output_quant"), ) logger.info(f"Quantizable op types: {[t for t in quantizable_op_types if t in op_types]}") # Collect node names to include in quantization nodes_to_quantize = nodes_to_quantize or [] - no_quantize_inputs = no_quantize_inputs or [] + no_quantize_inputs = kwargs.get("no_quantize_inputs", []) if not autotune: nodes_to_quantize = expand_node_names_from_patterns(graph, nodes_to_quantize) if not nodes_to_quantize: diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 089c8850f..173fbb06d 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -271,7 +271,7 @@ def configure_ort( calibration_eps: list[str] | None = None, calibrate_per_node: bool = False, custom_ops_to_quantize: list[str] = [], - autotune: bool = False, + op_types_needing_output_quant: list[str] | None = None, ): """Configure and patches ORT to support ModelOpt ONNX quantization.""" logger.info("Configuring ORT for ModelOpt ONNX quantization") @@ -290,38 +290,40 @@ def configure_ort( # Patch ORT modules to fix bugs and support some edge cases patch_ort_modules(calibrate_per_node) - if not autotune: - # Remove copy, reduction and activation ops from ORT QDQ registry - logger.debug("Removing non-quantizable ops from QDQ registry") - for op_type in [ - "ArgMax", - "Concat", - "EmbedLayerNormalization", - "Gather", - "GatherElements", - "GatherND", - "InstanceNormalization", - "LeakyRelu", - "Pad", - "Relu", - "Reshape", - "Slice", - "Sigmoid", - "Softmax", - "Split", - "Squeeze", - "Transpose", - "Unsqueeze", - "Where", - ]: - if op_type in QLinearOpsRegistry: - del QLinearOpsRegistry[op_type] - if op_type in QDQRegistry: - del QDQRegistry[op_type] + # Remove copy, reduction and activation ops from ORT QDQ registry + logger.debug("Removing non-quantizable ops from 
QDQ registry") + for op_type in { + "ArgMax", + "Concat", + "EmbedLayerNormalization", + "Gather", + "GatherElements", + "GatherND", + "InstanceNormalization", + "LeakyRelu", + "Pad", + "Relu", + "Reshape", + "Slice", + "Sigmoid", + "Softmax", + "Split", + "Squeeze", + "Transpose", + "Unsqueeze", + "Where", + } - set(op_types_to_quantize): + if op_type in QLinearOpsRegistry: + del QLinearOpsRegistry[op_type] + if op_type in QDQRegistry: + del QDQRegistry[op_type] # Prepare TensorRT friendly quantization settings no_output_quantization_op_types = [ - op_type for op_type in op_types if op_type not in custom_ops_to_quantize + op_type + for op_type in op_types + if op_type not in custom_ops_to_quantize + and op_type not in (op_types_needing_output_quant or []) ] if trt_extra_plugin_lib_paths is not None: trt_extra_plugin_lib_paths = ";".join(trt_extra_plugin_lib_paths) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index a4f631f9f..743b5e368 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -44,7 +44,7 @@ import onnxslim from modelopt.onnx.logging_config import configure_logging, logger -from modelopt.onnx.op_types import is_data_dependent_shape_op +from modelopt.onnx.op_types import get_activation_ops, is_data_dependent_shape_op from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -249,7 +249,7 @@ def _find_nodes_to_quantize_autotune( quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", -) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]]]: +) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops from modelopt.onnx.quantization.autotune.workflows import ( @@ -277,16 +277,43 @@ def _find_nodes_to_quantize_autotune( 
nodes_to_quantize_autotune_names = [n.name for n in nodes_to_quantize_autotune] op_types_to_quantize = list(get_autotuner_quantizable_ops()) - # Get non-quantizable tensors + # Get non-quantizable tensors and identify op types whose outputs are quantized. # List of non-quantizable tensors in the form of (src_node, dst_node, tensor_name) no_quantize_inputs = [] + # List of ops to enable output quantization. + # By default, all ONNX standard ops have output quantization disabled due to TensorRT's quantization recipe + # (inputs and weights only). However, this causes QDQRemovableActivation (used for Relu, Sigmoid, etc.) to exit + # early when it checks is_tensor_quantized() on its input, producing no Q/DQ between e.g. Add and Relu. This list + # will be used in configure_ort() to enable output quantization of the ops included in it. + op_types_needing_output_quant = set() for node in nodes_to_quantize_autotune: for idx, inp in enumerate(node.inputs): if inp.inputs and inp.inputs[0].op != "DequantizeLinear": src_node = node.i(idx) no_quantize_inputs.append((src_node, node, inp.name)) + elif ( + inp.inputs + and inp.inputs[0].op == "DequantizeLinear" + and node.op in get_activation_ops() + ): + # Trace back through DQ→Q to find the node whose output is being quantized. + # Path: node.input ← DQ ← quantized_tensor ← Q ← original_tensor ← producer + dq_node = inp.inputs[0] + quantized_tensor = dq_node.inputs[0] # Q's output (= DQ's input) + if quantized_tensor.inputs: + q_node = quantized_tensor.inputs[0] # QuantizeLinear node + if q_node.op == "QuantizeLinear" and q_node.inputs: + original_tensor = q_node.inputs[0] # e.g. Add_output_0 + if original_tensor.inputs: + producer = original_tensor.inputs[0] # e.g. 
Add + op_types_needing_output_quant.add(producer.op) - return nodes_to_quantize_autotune_names, op_types_to_quantize, no_quantize_inputs + return ( + nodes_to_quantize_autotune_names, + op_types_to_quantize, + no_quantize_inputs, + list(op_types_needing_output_quant), + ) def quantize( @@ -558,18 +585,22 @@ def quantize( calibration_shapes = get_input_shapes(onnx_path) if quantize_mode in ["fp8", "int8"]: - no_quantize_inputs = [] if autotune: - nodes_to_quantize_autotune, op_types_to_quantize, no_quantize_inputs = ( - _find_nodes_to_quantize_autotune( - onnx_path, - onnx_model, - quantize_mode, - trt_plugins, - high_precision_dtype, - ) + ( + nodes_to_quantize_autotune, + op_types_to_quantize, + no_quantize_inputs, + op_types_needing_output_quant, + ) = _find_nodes_to_quantize_autotune( + onnx_path, + onnx_model, + quantize_mode, + trt_plugins, + high_precision_dtype, ) nodes_to_quantize.extend(nodes_to_quantize_autotune) + kwargs["no_quantize_inputs"] = no_quantize_inputs + kwargs["op_types_needing_output_quant"] = op_types_needing_output_quant quantize_func = quantize_int8 if quantize_mode == "int8" else quantize_fp8 onnx_model = quantize_func( @@ -597,14 +628,9 @@ def quantize( direct_io_types=direct_io_types, opset=opset, autotune=autotune, - no_quantize_inputs=no_quantize_inputs, **kwargs, ) - # if autotune: - # # Copy real scales to quantized model - # print() - elif "int4" in quantize_mode: onnx_model = quantize_int4( onnx_path=onnx_path, diff --git a/modelopt/onnx/utils.py b/modelopt/onnx/utils.py index 168af663b..e1e9715f1 100644 --- a/modelopt/onnx/utils.py +++ b/modelopt/onnx/utils.py @@ -173,13 +173,13 @@ def get_dynamic_graph_inputs(onnx_model: onnx.ModelProto): def get_quantized_nodes(onnx_model: onnx.ModelProto) -> list: - """This function returns the nodes preceded by a DQ node. + """This function returns the nodes preceded by a DQ node or followed by a Q node. Args: onnx_model: ONNX model to traverse. Returns: - List of quantized nodes. 
+ List of quantized nodes (input or output). """ graph = gs.import_onnx(onnx_model) @@ -187,6 +187,7 @@ def get_quantized_nodes(onnx_model: onnx.ModelProto) -> list: node for node in graph.nodes if any(inp.inputs[0].op == "DequantizeLinear" for inp in node.inputs if inp.inputs) + or any(out.outputs[0].op == "QuantizeLinear" for out in node.outputs if out.outputs) ] From bb030bec6ef7d1d9ec84df0752df9f18273f284c Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 23 Feb 2026 20:29:14 -0500 Subject: [PATCH 13/42] Fix: correctly dequantize Add input with shared Q/DQ Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/graph_utils.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py index efa77dd7b..9ef88d4a9 100755 --- a/modelopt/onnx/quantization/graph_utils.py +++ b/modelopt/onnx/quantization/graph_utils.py @@ -616,16 +616,27 @@ def remove_partial_input_qdq( # Reached end of the graph continue if dq_node.op == "DequantizeLinear": - dq_node = dq_node.outputs[0] # source_node->Q->DQ->target_node0 + dq_output = dq_node.outputs[0] # source_node->Q->DQ->target_node + + # Look up the specific target node in the quantized graph. + # With DedicatedQDQPair=False, a shared Q/DQ pair may feed multiple consumers + # (e.g. Conv activation AND Add residual). Always patch the intended target + # rather than the first consumer of the DQ output to avoid removing Q/DQ from + # the wrong branch. 
+ target_node_in_graph = graph_nodes.get(target.name) + if target_node_in_graph is None: + continue - # Find the input index in the target connecting with source_node + # Find the input index in the target that is connected to the DQ output target_input_idx_arr = [ - idx for idx, inp in enumerate(dq_node.outputs[0].inputs) if inp.name == dq_node.name + idx + for idx, inp in enumerate(target_node_in_graph.inputs) + if inp.name == dq_output.name ] target_input_idx = target_input_idx_arr[0] if target_input_idx_arr else 0 - # Connect the output of source_node with the output of DQ - dq_node.outputs[0].inputs[target_input_idx] = source_node.outputs[0] + # Connect the target's input directly to source_node's output (bypass Q/DQ) + target_node_in_graph.inputs[target_input_idx] = source_node.outputs[0] # Check for quantized residual Adds where the parallel branch is not being quantized for source, target, non_qdq_input_name in no_quantize_inputs: From 616227c944e48a94fd45f15929c0a1346f89b596 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 24 Feb 2026 12:21:47 -0500 Subject: [PATCH 14/42] [5916893] Fix weighted ops quantization logic: both input and weights Q/DQ need to be added or removed Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/autotuner.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/modelopt/onnx/quantization/autotune/autotuner.py b/modelopt/onnx/quantization/autotune/autotuner.py index 69038c59a..7afc50559 100644 --- a/modelopt/onnx/quantization/autotune/autotuner.py +++ b/modelopt/onnx/quantization/autotune/autotuner.py @@ -22,6 +22,19 @@ from modelopt.onnx.quantization.autotune.common import Config, PatternCache, Region, RegionType from modelopt.onnx.quantization.autotune.region_search import CombinedRegionSearch +_MUTATION_SPECS = [ + ("node_inputs", "node input points", lambda p: (p.node_index, p.input_index)), + ( + 
"child_region_inputs", + "region composite points", + lambda p: (p.region_index, p.input_index), + ), + ( + "region_outputs", + "region output points", + lambda p: (p.region_index, p.node_index, p.output_index), + ), +] class QDQAutotuner(QDQAutotunerBase): """Q/DQ autotuner with automatic region discovery around compute-intensive ops.""" From afee0a4a4504dd65a7f99cde15906ed77d2703b1 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:29:26 -0500 Subject: [PATCH 15/42] Changed keep_output_dir to True as default Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/common.py | 3 --- modelopt/onnx/quantization/autotune/workflows.py | 2 +- modelopt/onnx/quantization/quantize.py | 3 ++- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/common.py b/modelopt/onnx/quantization/autotune/common.py index 8717685c7..d3b3de272 100644 --- a/modelopt/onnx/quantization/autotune/common.py +++ b/modelopt/onnx/quantization/autotune/common.py @@ -531,9 +531,6 @@ def add_pattern_schemes(self, pattern_schemes: PatternSchemes) -> None: else: # Existing scheme is better, skip new one too_similar = True - if scheme.latency_ms < existing_scheme.latency_ms: - # New scheme is better, mark existing for replacement - schemes_to_replace.append(existing_scheme) break if existing_to_remove is not None: diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index 57ad73015..a8ba279e0 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -170,7 +170,7 @@ def region_pattern_autotuning_workflow( qdq_baseline_model: str | None = None, node_filter_list: list[str] | None = None, verbose: bool = False, - keep_output_dir: bool = False, + keep_output_dir: bool = True, ) -> QDQAutotuner: """Run automated Q/DQ 
(Quantization/Dequantization) optimization on an ONNX model. diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 743b5e368..64847057a 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -264,6 +264,7 @@ def _find_nodes_to_quantize_autotune( onnx_model, quant_type=quantize_mode, default_dq_dtype=precision_map[high_precision_dtype], + keep_output_dir=False, ) # Export model with Q/DQ insertion @@ -296,7 +297,7 @@ def _find_nodes_to_quantize_autotune( and inp.inputs[0].op == "DequantizeLinear" and node.op in get_activation_ops() ): - # Trace back through DQ→Q to find the node whose output is being quantized. + # Trace back through DQ → Q to find the node whose output is being quantized. # Path: node.input ← DQ ← quantized_tensor ← Q ← original_tensor ← producer dq_node = inp.inputs[0] quantized_tensor = dq_node.inputs[0] # Q's output (= DQ's input) From faf0bbbf5f15f0e6f806b45f80c1ebb9e31bd782 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:46:25 -0500 Subject: [PATCH 16/42] test_workflow was moved to 'tests/gpu/onnx' Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/autotune/test_workflows.py | 80 ------------------- 1 file changed, 80 deletions(-) delete mode 100644 tests/unit/onnx/quantization/autotune/test_workflows.py diff --git a/tests/unit/onnx/quantization/autotune/test_workflows.py b/tests/unit/onnx/quantization/autotune/test_workflows.py deleted file mode 100644 index c8edafc06..000000000 --- a/tests/unit/onnx/quantization/autotune/test_workflows.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import tempfile -from pathlib import Path - -import onnx -import pytest - -# Add parent directory to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from _test_utils.import_helper import skip_if_no_tensorrt, skip_if_no_trtexec -from unit.onnx.quantization.autotune.test_autotuner import create_simple_conv_model - -from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, -) - - -@pytest.mark.parametrize("use_trtexec", [True, False]) -def test_export_quantized_model(use_trtexec): - """Test exporting quantized model with Q/DQ.""" - if use_trtexec: - skip_if_no_trtexec() - else: - skip_if_no_tensorrt() - - model = create_simple_conv_model() - - with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: - baseline_model_path = f.name - - # Save baseline model - onnx.save(model, baseline_model_path) - - output_dir = baseline_model_path.strip(".onnx") - output_path = output_dir + ".quant.onnx" - - try: - init_benchmark_instance(use_trtexec=use_trtexec) - autotuner = region_pattern_autotuning_workflow(baseline_model_path, Path(output_dir)) - - # Export model with Q/DQ insertion - autotuner.export_onnx(output_path, insert_qdq=True) - - # Verify file was created - assert os.path.exists(output_path) - - # Verify it's a valid ONNX model - exported_model = onnx.load(output_path) - assert exported_model is not None - - # Verify that it contains Q/DQ nodes - qdq_nodes = [ - n - for n in exported_model.graph.node - if n.op_type in 
["QuantizeLinear", "DequantizeLinear"] - ] - assert qdq_nodes, "Q/DQ nodes not found in quantized model" - - print("✓ QDQAutotuner export quantized model") - finally: - if os.path.exists(output_path): - os.unlink(output_path) From 08bf713497fb4c02646a820e7323147cc321d5c2 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:56:50 -0500 Subject: [PATCH 17/42] Removed cli.py, moved into __main__.py Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/cli.py | 294 --------------------- 1 file changed, 294 deletions(-) delete mode 100644 modelopt/onnx/quantization/autotune/cli.py diff --git a/modelopt/onnx/quantization/autotune/cli.py b/modelopt/onnx/quantization/autotune/cli.py deleted file mode 100644 index a5809f9a5..000000000 --- a/modelopt/onnx/quantization/autotune/cli.py +++ /dev/null @@ -1,294 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""CLI argument parsing and execution for ONNX Q/DQ autotuning. - -This module provides `run_autotune` which handles both argument parsing and -workflow execution. See `__main__.py` for usage examples. 
-""" - -import argparse -import sys -from pathlib import Path - -from modelopt.onnx.logging_config import logger -from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, -) - -DEFAULT_OUTPUT_DIR = "./autotuner_output" -DEFAULT_NUM_SCHEMES = 30 -DEFAULT_QUANT_TYPE = "int8" -DEFAULT_DQ_DTYPE = "float32" -DEFAULT_TIMING_CACHE = "/tmp/trtexec_timing.cache" # nosec B108 -DEFAULT_WARMUP_RUNS = 5 -DEFAULT_TIMING_RUNS = 20 - - -def validate_file_path(path: str | None, description: str) -> Path | None: - """Validate that a file path exists. - - Args: - path: Path string to validate (can be None) - description: Description of the file for error messages - - Returns: - Path object if valid, None if path is None - - Raises: - SystemExit: If path is provided but doesn't exist - """ - if path is None: - return None - - path_obj = Path(path) - if not path_obj.exists(): - logger.error(f"{description} not found: {path_obj}") - sys.exit(1) - - return path_obj - - -def log_benchmark_config(args): - """Log TensorRT benchmark configuration for transparency. - - Logs timing cache path, warmup/timing run counts, and any custom - plugin libraries that will be loaded. - - Args: - args: Parsed command-line arguments with benchmark configuration - """ - logger.info("Initializing TensorRT benchmark") - logger.info(f" Timing cache: {args.timing_cache}") - logger.info(f" Warmup runs: {args.warmup_runs}") - logger.info(f" Timing runs: {args.timing_runs}") - if args.plugin_libraries: - logger.info(f" Plugin libraries: {', '.join(args.plugin_libraries)}") - - -def run_autotune(args=None) -> int: - """Execute the complete pattern-based Q/DQ autotuning workflow. - - This function orchestrates the entire optimization process: - 1. Parses command-line arguments (if not provided) - 2. Validates input paths (model, baseline, output directory) - 3. Initializes TensorRT benchmark instance - 4. 
Runs pattern-based region autotuning workflow - 5. Handles interruptions gracefully with state preservation - - Args: - args: Optional parsed command-line arguments. If None, parses sys.argv. - - Returns: - Exit code: - - 0: Success - - 1: Autotuning failed (exception occurred) - - 130: Interrupted by user (Ctrl+C) - """ - if args is None: - args = _get_autotune_parser().parse_args() - - model_path = validate_file_path(args.onnx_path, "Model file") - validate_file_path(args.qdq_baseline, "QDQ baseline model") - output_dir = Path(args.output) - - log_benchmark_config(args) - init_benchmark_instance( - use_trtexec=args.use_trtexec, - plugin_libraries=args.plugin_libraries, - timing_cache_file=args.timing_cache, - warmup_runs=args.warmup_runs, - timing_runs=args.timing_runs, - ) - - logger.info("Autotuning Mode: Pattern-Based") - - try: - node_filter_list = None - if args.node_filter_list: - filter_file = validate_file_path(args.node_filter_list, "Node filter list file") - if filter_file: - with open(filter_file) as f: - node_filter_list = [ - line.strip() - for line in f - if line.strip() and not line.strip().startswith("#") - ] - logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") - - region_pattern_autotuning_workflow( - model_path=str(model_path), - output_dir=output_dir, - num_schemes_per_region=args.num_schemes, - pattern_cache_file=args.pattern_cache_file, - state_file=args.state_file, - quant_type=args.quant_type, - default_dq_dtype=args.default_dq_dtype, - qdq_baseline_model=args.qdq_baseline, - node_filter_list=node_filter_list, - ) - - logger.info("\n" + "=" * 70) - logger.info("✓ Autotuning completed successfully!") - logger.info(f"✓ Results: {output_dir}") - logger.info("=" * 70) - return 0 - - except KeyboardInterrupt: - logger.warning("\nInterrupted by user") - state_file = args.state_file or output_dir / "autotuner_state.yaml" - logger.info(f"Progress saved to: {state_file}") - return 130 - - except Exception as e: - 
logger.error(f"\nAutotuning failed: {e}", exc_info=args.verbose) - return 1 - - -def _get_autotune_parser() -> argparse.ArgumentParser: - """Create and configure the command-line argument parser.""" - parser = argparse.ArgumentParser( - prog="modelopt.onnx.quantization.autotune", - description="ONNX Q/DQ Autotuning with TensorRT", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Basic usage - python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx - - # Import patterns from QDQ baseline model - python -m modelopt.onnx.quantization.autotune \\ - --onnx_path model.onnx --qdq_baseline baseline.onnx - - # Use pattern cache for warm-start - python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --pattern_cache cache.yaml - - # Full example with all options - python -m modelopt.onnx.quantization.autotune \\ - --onnx_path model.onnx --schemes_per_region 50 \\ - --pattern_cache cache.yaml --qdq_baseline baseline.onnx \\ - --quant_type int8 --verbose - """, - ) - - # Model and Output - io_group = parser.add_argument_group("Model and Output") - io_group.add_argument( - "--onnx_path", "-m", type=str, required=True, help="Path to ONNX model file" - ) - io_group.add_argument( - "--output", - "-o", - type=str, - default=DEFAULT_OUTPUT_DIR, - help=f"Output directory for results (default: {DEFAULT_OUTPUT_DIR})", - ) - - # Autotuning Strategy - strategy_group = parser.add_argument_group("Autotuning Strategy") - strategy_group.add_argument( - "--schemes_per_region", - "-s", - type=int, - default=DEFAULT_NUM_SCHEMES, - dest="num_schemes", - help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})", - ) - strategy_group.add_argument( - "--pattern_cache", - type=str, - default=None, - dest="pattern_cache_file", - help="Path to pattern cache YAML for warm-start (optional)", - ) - strategy_group.add_argument( - "--qdq_baseline", - type=str, - default=None, - help="Path to QDQ baseline ONNX model to import 
quantization patterns (optional)", - ) - strategy_group.add_argument( - "--state_file", - type=str, - default=None, - help="State file path for resume capability (default: /autotuner_state.yaml)", - ) - strategy_group.add_argument( - "--node_filter_list", - type=str, - default=None, - help="Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " - "Regions without any matching nodes are skipped during autotuning.", - ) - - # Quantization - quant_group = parser.add_argument_group("Quantization") - quant_group.add_argument( - "--quant_type", - type=str, - default=DEFAULT_QUANT_TYPE, - choices=["int8", "fp8"], - help=f"Quantization data type (default: {DEFAULT_QUANT_TYPE})", - ) - quant_group.add_argument( - "--default_dq_dtype", - type=str, - default=DEFAULT_DQ_DTYPE, - choices=["float16", "float32", "bfloat16"], - help="Default DQ output dtype if cannot be deduced (optional)", - ) - - # TensorRT Benchmark - trt_group = parser.add_argument_group("TensorRT Benchmark") - trt_group.add_argument( - "--use_trtexec", - action="store_true", - help="Use trtexec for benchmarking (default: False)", - default=False, - ) - trt_group.add_argument( - "--timing_cache", - type=str, - default=DEFAULT_TIMING_CACHE, - help=f"TensorRT timing cache file (default: {DEFAULT_TIMING_CACHE})", - ) - trt_group.add_argument( - "--warmup_runs", - type=int, - default=DEFAULT_WARMUP_RUNS, - help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})", - ) - trt_group.add_argument( - "--timing_runs", - type=int, - default=DEFAULT_TIMING_RUNS, - help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})", - ) - trt_group.add_argument( - "--plugin_libraries", - "--plugins", - type=str, - nargs="+", - default=None, - dest="plugin_libraries", - help="TensorRT plugin libraries (.so files) to load (optional, space-separated)", - ) - - # Logging - parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose DEBUG logging") - - return parser From 
81fce4884f91a298cd5d7e6cfbb6c1dcf4afb67a Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:57:24 -0500 Subject: [PATCH 18/42] Removed PatternSchemes import from region_pattern.py: no longer needed. Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/region_pattern.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/region_pattern.py b/modelopt/onnx/quantization/autotune/region_pattern.py index 9f80bd56e..a32273f84 100644 --- a/modelopt/onnx/quantization/autotune/region_pattern.py +++ b/modelopt/onnx/quantization/autotune/region_pattern.py @@ -21,7 +21,7 @@ import onnx_graphsurgeon as gs from modelopt.onnx.op_types import get_symmetric_ops -from modelopt.onnx.quantization.autotune.common import InsertionScheme, PatternSchemes, Region +from modelopt.onnx.quantization.autotune.common import InsertionScheme, Region from modelopt.onnx.quantization.autotune.insertion_points import ( ChildRegionInputInsertionPoint, ChildRegionOutputInsertionPoint, @@ -161,9 +161,6 @@ def matches( is provided but other is not a Region TypeError: If other is neither RegionPattern nor Region """ - if isinstance(scheme, PatternSchemes): - return set() - if isinstance(other, RegionPattern): if scheme is not None: raise ValueError("scheme parameter can only be used when matching against a Region") From 7a57b8d47e32c8850747f3e8c7eb9d5bccfaeeaf Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:57:58 -0500 Subject: [PATCH 19/42] Added intermediate Autotune model to be removed at the end of the quantization workflow Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py 
index 64847057a..5e872e24b 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -249,6 +249,7 @@ def _find_nodes_to_quantize_autotune( quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", + intermediate_generated_files: list[str] = [], ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops @@ -270,7 +271,7 @@ def _find_nodes_to_quantize_autotune( # Export model with Q/DQ insertion onnx_path_autotune = onnx_path.replace(".onnx", ".quant_autotune.onnx") onnx_bytes = autotuner.export_onnx(onnx_path_autotune, insert_qdq=True, best=True) - # intermediate_generated_files.append(onnx_path_autotune) + intermediate_generated_files.append(onnx_path_autotune) # Get nodes and op types to quantize onnx_model_autotune = onnx.load_from_string(onnx_bytes) @@ -598,6 +599,7 @@ def quantize( quantize_mode, trt_plugins, high_precision_dtype, + intermediate_generated_files, ) nodes_to_quantize.extend(nodes_to_quantize_autotune) kwargs["no_quantize_inputs"] = no_quantize_inputs From a71fc914a2b9f6a88247466c453565608462416c Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 12:59:34 -0500 Subject: [PATCH 20/42] Removed _MUTATION_SPECS from autotuner.py: moved to autotuner_base.py Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/autotuner.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/autotuner.py b/modelopt/onnx/quantization/autotune/autotuner.py index 7afc50559..69038c59a 100644 --- a/modelopt/onnx/quantization/autotune/autotuner.py +++ b/modelopt/onnx/quantization/autotune/autotuner.py @@ -22,19 +22,6 @@ from modelopt.onnx.quantization.autotune.common import Config, PatternCache, 
Region, RegionType from modelopt.onnx.quantization.autotune.region_search import CombinedRegionSearch -_MUTATION_SPECS = [ - ("node_inputs", "node input points", lambda p: (p.node_index, p.input_index)), - ( - "child_region_inputs", - "region composite points", - lambda p: (p.region_index, p.input_index), - ), - ( - "region_outputs", - "region output points", - lambda p: (p.region_index, p.node_index, p.output_index), - ), -] class QDQAutotuner(QDQAutotunerBase): """Q/DQ autotuner with automatic region discovery around compute-intensive ops.""" From 01e8be0982b27376b9ec43791f37f4f26f2fa584 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 13:02:50 -0500 Subject: [PATCH 21/42] Removed test_config and test_pattern_cache. Should be added in the original Auto Q/DQ PR. Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/test_config.py | 143 ------------------ .../onnx/quantization/autotune/test_region.py | 4 +- 2 files changed, 3 insertions(+), 144 deletions(-) delete mode 100644 tests/unit/onnx/quantization/autotune/test_config.py diff --git a/tests/unit/onnx/quantization/autotune/test_config.py b/tests/unit/onnx/quantization/autotune/test_config.py deleted file mode 100644 index c5b20a8a9..000000000 --- a/tests/unit/onnx/quantization/autotune/test_config.py +++ /dev/null @@ -1,143 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Tests for the Config class in the autotuner. - -Tests configuration parameter validation and defaults. -""" - -import os -import sys -import unittest - -# Add parent directory to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from modelopt.onnx.quantization.autotune.common import Config - - -class TestConfig(unittest.TestCase): - """Test Config class functionality.""" - - def test_default_values(self): - """Test that Config has correct default values.""" - config = Config() - - # Logging - assert not config.verbose - - # Performance thresholds - - # Q/DQ defaults - assert config.default_q_scale == 0.1 - assert config.default_q_zero_point == 0 - assert config.default_quant_type == "int8" - - # Region builder settings - assert config.maximum_sequence_region_size == 10 - assert config.minimum_topdown_search_size == 10 - - # Scheme generation parameters - assert config.top_percent_to_mutate == 0.1 - assert config.minimum_schemes_to_mutate == 10 - assert config.maximum_mutations == 3 - assert config.maximum_generation_attempts == 100 - - # Pattern cache parameters - assert config.pattern_cache_minimum_distance == 4 - assert config.pattern_cache_max_entries_per_pattern == 32 - - print("✓ Config default values are correct") - - def test_custom_values(self): - """Test creating Config with custom values.""" - config = Config( - verbose=True, - default_q_scale=0.05, - default_q_zero_point=128, - default_quant_type="fp8", - maximum_sequence_region_size=20, - ) - - assert config.verbose - assert config.default_q_scale 
== 0.05 - assert config.default_q_zero_point == 128 - assert config.default_quant_type == "fp8" - assert config.maximum_sequence_region_size == 20 - print("✓ Config custom values work correctly") - - def test_region_size_validation(self): - """Test that region size parameters are positive.""" - config = Config(maximum_sequence_region_size=50, minimum_topdown_search_size=5) - assert config.maximum_sequence_region_size > 0 - assert config.minimum_topdown_search_size > 0 - print("✓ Config region size validation") - - def test_genetic_algorithm_params(self): - """Test genetic algorithm parameters.""" - config = Config( - top_percent_to_mutate=0.2, - minimum_schemes_to_mutate=2, - maximum_mutations=5, - maximum_generation_attempts=50, - ) - - assert config.top_percent_to_mutate == 0.2 - assert config.minimum_schemes_to_mutate == 2 - assert config.maximum_mutations == 5 - assert config.maximum_generation_attempts == 50 - print("✓ Config genetic algorithm parameters") - - def test_pattern_cache_params(self): - """Test pattern cache parameters.""" - config = Config(pattern_cache_minimum_distance=3, pattern_cache_max_entries_per_pattern=10) - - assert config.pattern_cache_minimum_distance == 3 - assert config.pattern_cache_max_entries_per_pattern == 10 - print("✓ Config pattern cache parameters") - - -def run_tests(): - """Run all Config tests.""" - print("=" * 70) - print("Config Class Test Suite") - print("=" * 70) - - loader = unittest.TestLoader() - suite = unittest.TestSuite() - suite.addTests(loader.loadTestsFromTestCase(TestConfig)) - - runner = unittest.TextTestRunner(verbosity=2) - result = runner.run(suite) - - print("\n" + "=" * 70) - print("Test Summary") - print("=" * 70) - print(f"Tests run: {result.testsRun}") - print(f"Successes: {result.testsRun - len(result.failures) - len(result.errors)}") - print(f"Failures: {len(result.failures)}") - print(f"Errors: {len(result.errors)}") - - if result.wasSuccessful(): - print("\n✓ All Config tests passed!") - return 0 
- else: - print("\n✗ Some tests failed") - return 1 - - -if __name__ == "__main__": - sys.exit(run_tests()) diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index 3bbf34ac9..5a733017d 100644 --- a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -59,6 +59,7 @@ def test_parent_child_relationship(parent_with_children): assert parent.get_children() == [child1, child2] assert child1.parent == child2.parent == parent + def test_add_and_get_nodes(leaf): leaf.nodes.update([0, 1, 2]) assert set(leaf.get_nodes()) == {0, 1, 2} @@ -78,6 +79,7 @@ def test_region_size_recursive(parent_with_children): parent.nodes.add(5) assert len(parent.get_region_nodes_and_descendants()) == 6 + def test_metadata(leaf): leaf.metadata.update({"pattern": "Conv->Relu", "quantizable": "true"}) assert leaf.metadata == {"pattern": "Conv->Relu", "quantizable": "true"} @@ -107,4 +109,4 @@ def test_remove_child(): parent.add_child(child) parent.remove_child(child) assert parent.get_children() == [] - assert child.parent is None \ No newline at end of file + assert child.parent is None From ad7a60da0f437f06c1976d589f444b6f47e33706 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:26:53 -0500 Subject: [PATCH 22/42] Fixed minor coderabbit suggestions Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/workflows.py | 2 +- modelopt/onnx/quantization/quantize.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index a8ba279e0..6dd84d4c1 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -385,7 +385,7 @@ def region_pattern_autotuning_workflow( if not keep_output_dir: 
logger.debug( - f"Removing output dir: {output_dir}. Select 'keep_output_dir=False' if you wish to keep it." + f"Removing output dir: {output_dir}. Set 'keep_output_dir=True' if you wish to keep it." ) shutil.rmtree(output_dir) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 5e872e24b..641dfefda 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -249,7 +249,7 @@ def _find_nodes_to_quantize_autotune( quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", - intermediate_generated_files: list[str] = [], + intermediate_generated_files: list[str] | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops @@ -258,6 +258,9 @@ def _find_nodes_to_quantize_autotune( region_pattern_autotuning_workflow, ) + if intermediate_generated_files is None: + intermediate_generated_files = [] + # Initialize Autotuner with the Python 'tensorrt' package init_benchmark_instance(use_trtexec=False, plugin_libraries=trt_plugins) precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} From 7589668544f4c75632dc37e4a899233f16fbc832 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:29:05 -0500 Subject: [PATCH 23/42] Moved autotune imports to the top of the file Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 641dfefda..184483dec 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -45,6 +45,11 @@ from modelopt.onnx.logging_config import 
configure_logging, logger from modelopt.onnx.op_types import get_activation_ops, is_data_dependent_shape_op +from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -252,12 +257,6 @@ def _find_nodes_to_quantize_autotune( intermediate_generated_files: list[str] | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") - from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops - from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, - ) - if intermediate_generated_files is None: intermediate_generated_files = [] From db4c3effd937a3648eb4f73dd26f7cc1a1b6af7c Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:02:52 -0500 Subject: [PATCH 24/42] Eliminate intermediate ONNX export in _find_nodes_to_quantize_autotune(). Directly use Insertion Points information. 
Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/autotune/autotuner_base.py | 136 ++++++++++++++---- modelopt/onnx/quantization/quantize.py | 62 +------- 2 files changed, 115 insertions(+), 83 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index a519d7c61..e1ea3c29e 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -35,7 +35,7 @@ import yaml from modelopt.onnx.logging_config import logger -from modelopt.onnx.op_types import is_linear_op +from modelopt.onnx.op_types import get_activation_ops, is_linear_op from modelopt.onnx.quantization.autotune.common import ( AutotunerNotInitializedError, Config, @@ -46,7 +46,10 @@ Region, ) from modelopt.onnx.quantization.autotune.export_utils import export_qdq_onnx -from modelopt.onnx.quantization.autotune.insertion_points import ResolvedInsertionPoint +from modelopt.onnx.quantization.autotune.insertion_points import ( + ResolvedInsertionPoint, + get_autotuner_quantizable_ops, +) from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern from modelopt.onnx.quantization.graph_utils import get_tensor_consumer_node_indices @@ -434,6 +437,111 @@ def _exclude_overlapping_insertion_points( if all_region_ips: logger.debug(f" → Excluded {len(all_region_ips)} overlapping insertion points") + @_requires_init + def get_resolved_insertion_points( + self, best: bool = True, verbose: bool = False + ) -> set[ResolvedInsertionPoint]: + """Compute Q/DQ insertion points for the best schemes without exporting the model. + + Args: + best: If True, use the best scheme for each region. If False, use the current scheme. + verbose: If True, log matched-region counts and per-region insertion point details. + + Returns: + Set of ResolvedInsertionPoint objects representing where Q/DQ pairs should be inserted. 
+ + Raises: + AutotunerNotInitializedError: If initialize() hasn't been called + """ + resolved_insertion_points: set[ResolvedInsertionPoint] = set() + matched_regions = 0 + + if verbose: + logger.debug(f"Resolving Q/DQ insertion points from {len(self.regions)} regions") + + for region in self.regions: + current_scheme, pattern = self._resolve_scheme_for_region(region, best) + if current_scheme is None: + continue + self._exclude_overlapping_insertion_points(resolved_insertion_points, region, pattern) + new_ips = pattern.matches(region, self.graph, current_scheme) + if new_ips: + resolved_insertion_points.update(new_ips) + matched_regions += 1 + if verbose: + logger.debug(f" → Added {len(new_ips)} insertion points") + if verbose: + logger.debug( + f"Matched {matched_regions}/{len(self.regions)} regions, " + f"total {len(resolved_insertion_points)} unique insertion points" + ) + return resolved_insertion_points + + @_requires_init + def get_ort_quantization_config( + self, + ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: + """Derive ORT quantization configuration from resolved insertion points. + + Returns the four parameters consumed by INT8 and FP8 quantize() to replicate the autotuner's + Q/DQ placement decisions without exporting any intermediate ONNX file to disk. + + Returns: + nodes_to_quantize: Node names that have at least one covered Q/DQ input. + op_types_to_quantize: Op types eligible for quantization. + no_quantize_inputs: List of (src_node, dst_node, tensor_name) tuples for inputs + of quantized nodes that should NOT receive Q/DQ. + op_types_needing_output_quant: Producer op types whose output feeds a covered + activation-op input (needed so ORT inserts Q/DQ between e.g. Add and Relu). + + Raises: + AutotunerNotInitializedError: If initialize() hasn't been called. 
+ """ + resolved_ips = self.get_resolved_insertion_points(best=True) + graph = self.graph + + # Build (node_index, input_index) pairs that have Q/DQ + covered: set[tuple[int, int]] = set() + for ip in resolved_ips: + if ip.node_index is not None and ip.input_index is not None: + covered.add((ip.node_index, ip.input_index)) + else: + # Tensor-level insertion point: expand to all consumer (node, input) pairs + for consumer_idx in graph.tensor_users_map.get(ip.tensor_name, []): + node = graph.nodes[consumer_idx] + for inp_idx, inp in enumerate(node.inputs): + if getattr(inp, "name", None) == ip.tensor_name: + covered.add((consumer_idx, inp_idx)) + + quantized_node_indices: set[int] = {node_idx for node_idx, _ in covered} + nodes_to_quantize = [graph.nodes[i].name for i in quantized_node_indices] + op_types_to_quantize = list(get_autotuner_quantizable_ops()) + + # Inputs of quantized nodes NOT covered by Q/DQ (only non-constant producer inputs) + no_quantize_inputs: list[tuple[gs.Node, gs.Node, str]] = [] + for node_idx in quantized_node_indices: + node = graph.nodes[node_idx] + for inp_idx, inp in enumerate(node.inputs): + if (node_idx, inp_idx) not in covered and getattr(inp, "name", None): + if inp.inputs: + no_quantize_inputs.append((inp.inputs[0], node, inp.name)) + + # Producer op types whose output feeds a covered activation-op input + op_types_needing_output_quant: set[str] = set() + for node_idx, inp_idx in covered: + node = graph.nodes[node_idx] + if node.op in get_activation_ops(): + tensor = node.inputs[inp_idx] + if tensor.inputs: + op_types_needing_output_quant.add(tensor.inputs[0].op) + + return ( + nodes_to_quantize, + op_types_to_quantize, + no_quantize_inputs, + list(op_types_needing_output_quant), + ) + @_requires_init def export_onnx( self, output_path: str | None = None, insert_qdq: bool = True, best: bool = False @@ -469,29 +577,7 @@ def export_onnx( ) if insert_qdq: - matched_regions = 0 - - logger.debug(f"Resolving Q/DQ insertion points from 
{len(self.regions)} regions") - - for region in self.regions: - current_scheme, pattern = self._resolve_scheme_for_region(region, best) - if current_scheme is None: - continue - - self._exclude_overlapping_insertion_points( - resolved_insertion_points, region, pattern - ) - - new_ips = pattern.matches(region, self.graph, current_scheme) - if new_ips: - resolved_insertion_points.update(new_ips) - matched_regions += 1 - logger.debug(f" → Added {len(new_ips)} insertion points") - - logger.debug( - f"Matched {matched_regions}/{len(self.regions)} regions, " - f"total {len(resolved_insertion_points)} unique insertion points" - ) + resolved_insertion_points = self.get_resolved_insertion_points(best=best, verbose=True) unique_tensors = len(resolved_insertion_points) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 184483dec..8239fc3b3 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -44,8 +44,7 @@ import onnxslim from modelopt.onnx.logging_config import configure_logging, logger -from modelopt.onnx.op_types import get_activation_ops, is_data_dependent_shape_op -from modelopt.onnx.quantization.autotune.insertion_points import get_autotuner_quantizable_ops +from modelopt.onnx.op_types import is_data_dependent_shape_op from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, region_pattern_autotuning_workflow, @@ -78,7 +77,6 @@ QDQ_PRECISION_MIN_OPSET, duplicate_shared_constants, get_opset_version, - get_quantized_nodes, name_onnx_nodes, save_onnx, ) @@ -249,75 +247,25 @@ def _preprocess_onnx( def _find_nodes_to_quantize_autotune( - onnx_path: str, onnx_model: onnx.ModelProto, quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", - intermediate_generated_files: list[str] | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") - if 
intermediate_generated_files is None: - intermediate_generated_files = [] # Initialize Autotuner with the Python 'tensorrt' package init_benchmark_instance(use_trtexec=False, plugin_libraries=trt_plugins) precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} + + # Get Autotuner Q/DQ node placements autotuner = region_pattern_autotuning_workflow( onnx_model, quant_type=quantize_mode, default_dq_dtype=precision_map[high_precision_dtype], keep_output_dir=False, ) - - # Export model with Q/DQ insertion - onnx_path_autotune = onnx_path.replace(".onnx", ".quant_autotune.onnx") - onnx_bytes = autotuner.export_onnx(onnx_path_autotune, insert_qdq=True, best=True) - intermediate_generated_files.append(onnx_path_autotune) - - # Get nodes and op types to quantize - onnx_model_autotune = onnx.load_from_string(onnx_bytes) - nodes_to_quantize_autotune = get_quantized_nodes(onnx_model_autotune) - nodes_to_quantize_autotune_names = [n.name for n in nodes_to_quantize_autotune] - op_types_to_quantize = list(get_autotuner_quantizable_ops()) - - # Get non-quantizable tensors and identify op types whose outputs are quantized. - # List of non-quantizable tensors in the form of (src_node, dst_node, tensor_name) - no_quantize_inputs = [] - # List of ops to enable output quantization. - # By default, all ONNX standard ops have output quantization disabled due to TensorRT's quantization recipe - # (inputs and weights only). However, this causes QDQRemovableActivation (used for Relu, Sigmoid, etc.) to exit - # early when it checks is_tensor_quantized() on its input, producing no Q/DQ between e.g. Add and Relu. This list - # will be used in configure_ort() to enable output quantization of the ops included in it. 
- op_types_needing_output_quant = set() - for node in nodes_to_quantize_autotune: - for idx, inp in enumerate(node.inputs): - if inp.inputs and inp.inputs[0].op != "DequantizeLinear": - src_node = node.i(idx) - no_quantize_inputs.append((src_node, node, inp.name)) - elif ( - inp.inputs - and inp.inputs[0].op == "DequantizeLinear" - and node.op in get_activation_ops() - ): - # Trace back through DQ → Q to find the node whose output is being quantized. - # Path: node.input ← DQ ← quantized_tensor ← Q ← original_tensor ← producer - dq_node = inp.inputs[0] - quantized_tensor = dq_node.inputs[0] # Q's output (= DQ's input) - if quantized_tensor.inputs: - q_node = quantized_tensor.inputs[0] # QuantizeLinear node - if q_node.op == "QuantizeLinear" and q_node.inputs: - original_tensor = q_node.inputs[0] # e.g. Add_output_0 - if original_tensor.inputs: - producer = original_tensor.inputs[0] # e.g. Add - op_types_needing_output_quant.add(producer.op) - - return ( - nodes_to_quantize_autotune_names, - op_types_to_quantize, - no_quantize_inputs, - list(op_types_needing_output_quant), - ) + return autotuner.get_ort_quantization_config() def quantize( @@ -596,12 +544,10 @@ def quantize( no_quantize_inputs, op_types_needing_output_quant, ) = _find_nodes_to_quantize_autotune( - onnx_path, onnx_model, quantize_mode, trt_plugins, high_precision_dtype, - intermediate_generated_files, ) nodes_to_quantize.extend(nodes_to_quantize_autotune) kwargs["no_quantize_inputs"] = no_quantize_inputs From 42a0bdfe1f37ae61b470ba6b1e83f7a17c5323f0 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 13:39:58 -0500 Subject: [PATCH 25/42] Add support for Add->Q/DQ->Relu patterns by including those 'Add' nodes in the nodes_to_quantize Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/autotuner_base.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git 
a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index e1ea3c29e..1d5b18f14 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -441,7 +441,7 @@ def _exclude_overlapping_insertion_points( def get_resolved_insertion_points( self, best: bool = True, verbose: bool = False ) -> set[ResolvedInsertionPoint]: - """Compute Q/DQ insertion points for the best schemes without exporting the model. + """Compute Q/DQ insertion points for the best schemes (assuming best=True). Args: best: If True, use the best scheme for each region. If False, use the current scheme. @@ -513,7 +513,20 @@ def get_ort_quantization_config( if getattr(inp, "name", None) == ip.tensor_name: covered.add((consumer_idx, inp_idx)) + # Nodes that consume a covered (DQ-fed) input quantized_node_indices: set[int] = {node_idx for node_idx, _ in covered} + + # Also include producer nodes of covered inputs: a producer whose output feeds a + # covered slot needs to be in nodes_to_quantize so ORT can place Q on its output + # (e.g., Add must be included when Q/DQ sits between Add and Relu). 
+ node_name_to_idx = {node.name: i for i, node in enumerate(graph.nodes)} + for node_idx, inp_idx in covered: + tensor = graph.nodes[node_idx].inputs[inp_idx] + if tensor.inputs: + producer_idx = node_name_to_idx.get(tensor.inputs[0].name) + if producer_idx is not None: + quantized_node_indices.add(producer_idx) + nodes_to_quantize = [graph.nodes[i].name for i in quantized_node_indices] op_types_to_quantize = list(get_autotuner_quantizable_ops()) @@ -527,6 +540,7 @@ def get_ort_quantization_config( no_quantize_inputs.append((inp.inputs[0], node, inp.name)) # Producer op types whose output feeds a covered activation-op input + # (e.g., to support Add->Q/DQ->Relu patterns) op_types_needing_output_quant: set[str] = set() for node_idx, inp_idx in covered: node = graph.nodes[node_idx] From a70dbd3be8f61ed99f80111d422415e366134c37 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 13:43:37 -0500 Subject: [PATCH 26/42] Add integration test Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/models.py | 52 ++++++++++ .../test_autotune_quantization_integration.py | 98 +++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 tests/gpu/onnx/quantization/test_autotune_quantization_integration.py diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index fc63f6690..e230b4e97 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -20,6 +20,8 @@ """ import onnx +import torch +import torch.nn as nn from onnx import helper @@ -52,3 +54,53 @@ def _create_simple_conv_onnx_model(): ], ) return helper.make_model(graph, producer_name="test") + + +def _create_simple_resnet18_onnx_model(): # -> onnx.ModelProto: + """Build a ResNet-18 subgraph (stem + layer1) for MOQ + Autotuner integration tests. 
+ + Architecture: + Conv(3→64, 7×7, stride=2) → ReLU → MaxPool(3×3, stride=2) + → BasicBlock(64→64) → BasicBlock(64→64) + + Input shape: [1, 3, 1024, 1024], output shape: [1, 64, 256, 256]. + """ + + class _BasicBlock(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(64, 64, 3, padding=1, bias=True) + self.act1 = nn.ReLU() + self.conv2 = nn.Conv2d(64, 64, 3, padding=1, bias=True) + self.act2 = nn.ReLU() + + def forward(self, x): + return self.act2(self.conv2(self.act1(self.conv1(x))) + x) + + class _Model(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=True) + self.act1 = nn.ReLU() + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + self.layer1 = nn.Sequential(_BasicBlock(), _BasicBlock()) + + def forward(self, x): + return self.layer1(self.maxpool(self.act1(self.conv1(x)))) + + torch.manual_seed(42) + model = _Model().eval() + input_tensor = torch.zeros(1, 3, 1024, 1024) + + return model, input_tensor + # buf = io.BytesIO() + # torch.onnx.export( + # model, + # dummy_input, + # buf, + # input_names=["input"], + # output_names=["output"], + # opset_version=17, + # ) + # buf.seek(0) + # return onnx.load_from_string(buf.read()) diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py new file mode 100644 index 000000000..a38bde128 --- /dev/null +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from unittest.mock import patch + +import onnx +import onnx_graphsurgeon as gs +from _test_utils.import_helper import skip_if_no_tensorrt +from _test_utils.onnx.lib_test_models import export_as_onnx +from _test_utils.onnx.quantization.autotune.models import _create_simple_resnet18_onnx_model + +from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, +) +from modelopt.onnx.quantization.quantize import _preprocess_onnx, quantize + +skip_if_no_tensorrt() + + +def _quantized_tensor_indices(model: onnx.ModelProto) -> set[tuple[str, int]]: + """Return (node_name, input_index) for every DQ-fed input slot in the model.""" + graph = gs.import_onnx(model) + return { + (node.name, inp_idx) + for node in graph.nodes + for inp_idx, inp in enumerate(node.inputs) + if inp.inputs and inp.inputs[0].op == "DequantizeLinear" + } + + +def test_autotune_quantization_integration(tmp_path="./"): + """Ensure that the quantized tensors are the same for standalone Autotune and MOQ with Autotune. + + Runs the autotuner once to obtain a fixed set of insertion points. The same + autotuner instance is then injected into quantize() via patching so that both + sides reflect identical placement decisions without a second TRT profiling run. + + Compares the set of (node_name, input_index) pairs where a DQ node feeds the + input between: + - the autotuner's own export (via export_onnx), and + - the quantize(autotune=True) output model. 
+ """ + model_torch, input_tensor = _create_simple_resnet18_onnx_model() + onnx_path = os.path.join(tmp_path, "model.onnx") + output_path = onnx_path.replace(".onnx", ".quant.onnx") + + # Export torch model to ONNX + export_as_onnx(model_torch, input_tensor, onnx_filename=onnx_path) + + # Load and pre-process ONNX + onnx_path, onnx_model, *_ = _preprocess_onnx( + onnx_path, + use_external_data_format=False, + output_path=output_path, + enable_shared_constants_duplication=True, + trt_plugins=None, + trt_plugins_precision=None, + override_shapes=None, # type: ignore[arg-type] + quantize_mode="int8", + ) + + # Run autotune once to get a determined set of placement decisions. + init_benchmark_instance(use_trtexec=False) + autotuner = region_pattern_autotuning_workflow( + onnx_model, + quant_type="int8", + default_dq_dtype="float16", + keep_output_dir=False, + ) + + # Autotune path: export the Q/DQ model directly and collect quantized tensor slots. + autotune_model = onnx.load_from_string(autotuner.export_onnx(best=True)) + autotune_tensors = _quantized_tensor_indices(autotune_model) + + # MOQ + Autotune path: inject the same autotuner so placement decisions are identical, + # then run the full quantize() pipeline and collect quantized tensor slots. 
+ with patch( + "modelopt.onnx.quantization.quantize.region_pattern_autotuning_workflow", + return_value=autotuner, + ): + quantize(onnx_path, autotune=True, output_path=output_path) + + moq_tensors = _quantized_tensor_indices(onnx.load(output_path)) + assert autotune_tensors == moq_tensors From e1c8af7fcf22220ffe4226ab895d2ec80b303db5 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 14:25:13 -0500 Subject: [PATCH 27/42] Remove 'keep_output_dir' arg (no longer needed due to tmp path) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/workflows.py | 9 --------- modelopt/onnx/quantization/quantize.py | 1 - 2 files changed, 10 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index 6dd84d4c1..a87dcf8b9 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -20,7 +20,6 @@ """ import fnmatch -import shutil import tempfile from pathlib import Path @@ -170,7 +169,6 @@ def region_pattern_autotuning_workflow( qdq_baseline_model: str | None = None, node_filter_list: list[str] | None = None, verbose: bool = False, - keep_output_dir: bool = True, ) -> QDQAutotuner: """Run automated Q/DQ (Quantization/Dequantization) optimization on an ONNX model. @@ -214,7 +212,6 @@ def region_pattern_autotuning_workflow( node_filter_list: Optional list of wildcard patterns to filter ONNX nodes. Regions without any matching nodes are skipped during autotuning (default: None) verbose: Enable verbose logging in Config for detailed autotuner output (default: False) - keep_output_dir: If True, keep output_dir, otherwise, remove it at the end of this function. 
Returns: QDQAutotuner instance after autotuning @@ -383,10 +380,4 @@ def region_pattern_autotuning_workflow( logger.debug(f" Logs: {logs_dir}") logger.debug(f" Region models: {models_dir}") - if not keep_output_dir: - logger.debug( - f"Removing output dir: {output_dir}. Set 'keep_output_dir=True' if you wish to keep it." - ) - shutil.rmtree(output_dir) - return autotuner diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 8239fc3b3..88acb7796 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -263,7 +263,6 @@ def _find_nodes_to_quantize_autotune( onnx_model, quant_type=quantize_mode, default_dq_dtype=precision_map[high_precision_dtype], - keep_output_dir=False, ) return autotuner.get_ort_quantization_config() From 159b9f25b0e47f8c24f096dfee607d8e298aab39 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 14:51:02 -0500 Subject: [PATCH 28/42] Remove 'get_quantized_nodes' and other comments that are no longer needed Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/utils.py | 19 ------------------- .../onnx/quantization/autotune/models.py | 11 ----------- 2 files changed, 30 deletions(-) diff --git a/modelopt/onnx/utils.py b/modelopt/onnx/utils.py index e1e9715f1..4025ea065 100644 --- a/modelopt/onnx/utils.py +++ b/modelopt/onnx/utils.py @@ -172,25 +172,6 @@ def get_dynamic_graph_inputs(onnx_model: onnx.ModelProto): return [inp for inp in graph.inputs if any(isinstance(s, str) or s <= 0 for s in inp.shape)] -def get_quantized_nodes(onnx_model: onnx.ModelProto) -> list: - """This function returns the nodes preceded by a DQ node or followed by a Q node. - - Args: - onnx_model: ONNX model to traverse. - - Returns: - List of quantized nodes (input or output). 
- """ - graph = gs.import_onnx(onnx_model) - - return [ - node - for node in graph.nodes - if any(inp.inputs[0].op == "DequantizeLinear" for inp in node.inputs if inp.inputs) - or any(out.outputs[0].op == "QuantizeLinear" for out in node.outputs if out.outputs) - ] - - def _get_all_shapes(container: Any) -> dict[str, list[int]]: """This method returns the shape of tensors within a RepeatedCompositeContainer. diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index e230b4e97..68342b9ff 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -93,14 +93,3 @@ def forward(self, x): input_tensor = torch.zeros(1, 3, 1024, 1024) return model, input_tensor - # buf = io.BytesIO() - # torch.onnx.export( - # model, - # dummy_input, - # buf, - # input_names=["input"], - # output_names=["output"], - # opset_version=17, - # ) - # buf.seek(0) - # return onnx.load_from_string(buf.read()) From 51df98273576d45182d40db268a405545fcf544a Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 3 Mar 2026 15:16:20 -0500 Subject: [PATCH 29/42] Added docstring for 'default_dq_dtype' in workflows.py Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/workflows.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modelopt/onnx/quantization/autotune/workflows.py b/modelopt/onnx/quantization/autotune/workflows.py index a87dcf8b9..483cf1314 100644 --- a/modelopt/onnx/quantization/autotune/workflows.py +++ b/modelopt/onnx/quantization/autotune/workflows.py @@ -206,6 +206,7 @@ def region_pattern_autotuning_workflow( uses /autotuner_state.yaml (default: None) quant_type: Quantization data type - "int8" for INT8 quantization (default), "fp8" for FP8 quantization + default_dq_dtype: Dtype for DequantizeLinear output; "float32" (default) or "float16". 
 qdq_baseline_model: Optional path to a pre-quantized ONNX model. If provided,
            extracts Q/DQ insertion patterns and adds them to pattern cache for warm-start
            (default: None)

From 1dc03cd264a1d4f352379a24d6e50b12b6e7b2e1 Mon Sep 17 00:00:00 2001
From: gcunhase <4861122+gcunhase@users.noreply.github.com>
Date: Wed, 4 Mar 2026 20:38:50 -0500
Subject: [PATCH 30/42] Added mode presets and additional autotune
 configurations

Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com>
---
 modelopt/onnx/quantization/__main__.py        | 139 +++++++++++++++++-
 .../onnx/quantization/autotune/__init__.py    |   6 +
 .../onnx/quantization/autotune/__main__.py    |  34 +++--
 modelopt/onnx/quantization/quantize.py        |  55 ++++++-
 4 files changed, 217 insertions(+), 17 deletions(-)

diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py
index 433980249..4ba4f072b 100644
--- a/modelopt/onnx/quantization/__main__.py
+++ b/modelopt/onnx/quantization/__main__.py
@@ -20,6 +20,11 @@

 import numpy as np

+from modelopt.onnx.quantization.autotune import (
+    MODE_PRESETS,
+    _StoreWithExplicitFlag,
+    get_node_filter_list,
+)
 from modelopt.onnx.quantization.quantize import quantize

 __all__ = ["main"]
@@ -297,14 +302,126 @@ def get_parser() -> argparse.ArgumentParser:
     )
     argparser.add_argument(
         "--autotune",
+        nargs="?",
+        const="default",
+        default=None,
+        choices=["quick", "default", "extensive"],
+        help=(
+            "If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. "
+            "Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): "
+            " - 'quick': fewer schemes and benchmark runs for quick exploration;"
+            " - 'default': balanced, recommended for most cases;"
+            " - 'extensive': more schemes and runs for extensive search and thorough tuning. "
+            "Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset."
+ ), + ) + + autotune_group = argparser.add_argument_group( + "Autotune (only applicable when --autotune is set)" + ) + autotune_group.add_argument( + "--autotune_output_dir", + type=str, + default=None, + help="Output directory for autotune results (state file, logs). Default: temp directory.", + ) + autotune_group.add_argument( + "--autotune_schemes_per_region", + type=int, + default=30, + help="Number of Q/DQ schemes to test per region.", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_schemes_per_region", + ) + autotune_group.add_argument( + "--autotune_pattern_cache", + type=str, + default=None, + dest="autotune_pattern_cache_file", + help="Path to pattern cache YAML for warm-start.", + ) + autotune_group.add_argument( + "--autotune_qdq_baseline", + type=str, + default=None, + help="Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start.", + ) + autotune_group.add_argument( + "--autotune_state_file", + type=str, + default=None, + help="State file path for crash recovery and resume capability (default: /autotuner_state.yaml).", + ) + autotune_group.add_argument( + "--autotune_node_filter_list", + type=str, + default=None, + help=( + "Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). " + "Regions without any matching nodes are skipped during autotuning." 
+ ), + ) + autotune_group.add_argument( + "--autotune_verbose", + action="store_true", + help="Enable verbose logging in the autotuner.", + ) + autotune_group.add_argument( + "--autotune_use_trtexec", action="store_true", + help="Use trtexec for benchmarking instead of the TensorRT Python API.", + ) + autotune_group.add_argument( + "--autotune_timing_cache", + type=str, + default=None, + help="TensorRT timing cache file for faster engine builds.", + ) + autotune_group.add_argument( + "--autotune_warmup_runs", + type=int, + default=5, + help="Number of warmup runs before timing.", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_warmup_runs", + ) + autotune_group.add_argument( + "--autotune_timing_runs", + type=int, + default=20, + help="Number of timed runs for latency measurement.", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_autotune_timing_runs", + ) + autotune_group.add_argument( + "--autotune_trtexec_args", + type=str, + default=None, help=( - "If set, detect optimal Q/DQ node placements according to the TensorRT version and platform available." + "Additional trtexec arguments as a single quoted string. " + "Example: --autotune_trtexec_args '--fp16 --workspace=4096'" ), ) return argparser +def apply_mode_presets(args) -> None: + """Apply --autotune=mode preset to schemes_per_region, warmup_runs, timing_runs. + + Only applies preset for an option when that option was not explicitly set on the + command line (explicit flags override the preset). 
+ """ + if args.autotune not in MODE_PRESETS: + return + preset = MODE_PRESETS[args.autotune] + if not getattr(args, "_explicit_autotune_schemes_per_region", False): + args.autotune_schemes_per_region = preset["schemes_per_region"] + if not getattr(args, "_explicit_autotune_warmup_runs", False): + args.autotune_warmup_runs = preset["warmup_runs"] + if not getattr(args, "_explicit_autotune_timing_runs", False): + args.autotune_timing_runs = preset["timing_runs"] + + def main(): """Command-line entrypoint for ONNX PTQ.""" args = get_parser().parse_args() @@ -338,6 +455,12 @@ def main(): else: raise + # Autotune configs + autotune_enabled = args.autotune is not None + if autotune_enabled: + apply_mode_presets(args) + autotune_node_filter_list = get_node_filter_list(args.autotune_node_filter_list) + quantize( args.onnx_path, quantize_mode=args.quantize_mode, @@ -369,7 +492,19 @@ def main(): calibrate_per_node=args.calibrate_per_node, direct_io_types=args.direct_io_types, opset=args.opset, - autotune=args.autotune, + autotune=autotune_enabled, + autotune_output_dir=args.autotune_output_dir, + autotune_num_schemes_per_region=args.autotune_schemes_per_region, + autotune_pattern_cache_file=args.autotune_pattern_cache_file, + autotune_state_file=args.autotune_state_file, + autotune_qdq_baseline=args.autotune_qdq_baseline, + autotune_node_filter_list=autotune_node_filter_list, + autotune_verbose=args.autotune_verbose, + autotune_use_trtexec=args.autotune_use_trtexec, + autotune_timing_cache=args.autotune_timing_cache, + autotune_warmup_runs=args.autotune_warmup_runs, + autotune_timing_runs=args.autotune_timing_runs, + autotune_trtexec_args=args.autotune_trtexec_args, ) diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index 7f14bb360..8243cfbef 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -20,6 +20,9 @@ region analysis to efficiently explore 
and optimize Q/DQ insertion strategies. """ +# Expose Autotune modes and args +from .__main__ import MODE_PRESETS, _StoreWithExplicitFlag, get_node_filter_list + # Core data structures from .autotuner import QDQAutotuner from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark @@ -44,6 +47,7 @@ from .region_search import CombinedRegionSearch __all__ = [ + "MODE_PRESETS", "AutotunerError", "AutotunerNotInitializedError", "ChildRegionInputInsertionPoint", @@ -62,4 +66,6 @@ "ResolvedInsertionPoint", "TensorRTPyBenchmark", "TrtExecBenchmark", + "_StoreWithExplicitFlag", + "get_node_filter_list", ] diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index cb7b3c281..9b4a2cd53 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -116,6 +116,27 @@ def log_benchmark_config(args): logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") +def get_node_filter_list(node_filter_list_path: str) -> list | None: + """Extract node filter list from node filters path. + + Args: + node_filter_list_path: Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). + + Returns: + Node filter list + """ + node_filter_list = None + if node_filter_list_path: + filter_file = validate_file_path(node_filter_list_path, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() for line in f if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + return node_filter_list + + def run_autotune() -> int: """Execute the complete pattern-based Q/DQ autotuning workflow. 
@@ -155,18 +176,7 @@ def run_autotune() -> int: return 1 try: - node_filter_list = None - if args.node_filter_list: - filter_file = validate_file_path(args.node_filter_list, "Node filter list file") - if filter_file: - with open(filter_file) as f: - node_filter_list = [ - line.strip() - for line in f - if line.strip() and not line.strip().startswith("#") - ] - logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") - + node_filter_list = get_node_filter_list(args.node_filter_list) region_pattern_autotuning_workflow( model_path=str(model_path), output_dir=output_dir, diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 88acb7796..397147ac1 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -36,6 +36,7 @@ import shutil import tempfile from collections.abc import Sequence +from pathlib import Path from typing import Any import onnx @@ -251,18 +252,42 @@ def _find_nodes_to_quantize_autotune( quantize_mode: str, trt_plugins: list[str], high_precision_dtype: str = "fp16", + output_dir: str | None = None, + num_schemes_per_region: int = 30, + pattern_cache_file: str | None = None, + state_file: str | None = None, + qdq_baseline_model: str | None = None, + node_filter_list: list[str] | None = None, + verbose: bool = False, + use_trtexec: bool = False, + timing_cache_file: str | None = None, + warmup_runs: int = 5, + timing_runs: int = 20, + trtexec_args: str | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") - # Initialize Autotuner with the Python 'tensorrt' package - init_benchmark_instance(use_trtexec=False, plugin_libraries=trt_plugins) + init_benchmark_instance( + use_trtexec=use_trtexec, + plugin_libraries=trt_plugins, + timing_cache_file=timing_cache_file, + warmup_runs=warmup_runs, + timing_runs=timing_runs, + trtexec_args=trtexec_args.split() if 
trtexec_args else None, + ) precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} - # Get Autotuner Q/DQ node placements autotuner = region_pattern_autotuning_workflow( onnx_model, + output_dir=Path(output_dir) if output_dir else None, + num_schemes_per_region=num_schemes_per_region, + pattern_cache_file=pattern_cache_file, + state_file=state_file, quant_type=quantize_mode, default_dq_dtype=precision_map[high_precision_dtype], + qdq_baseline_model=qdq_baseline_model, + node_filter_list=node_filter_list, + verbose=verbose, ) return autotuner.get_ort_quantization_config() @@ -301,6 +326,18 @@ def quantize( direct_io_types: bool = False, opset: int | None = None, autotune: bool = False, + autotune_output_dir: str | None = None, + autotune_num_schemes_per_region: int = 30, + autotune_pattern_cache_file: str | None = None, + autotune_state_file: str | None = None, + autotune_qdq_baseline: str | None = None, + autotune_node_filter_list: list[str] | None = None, + autotune_verbose: bool = False, + autotune_use_trtexec: bool = False, + autotune_timing_cache: str | None = None, + autotune_warmup_runs: int = 5, + autotune_timing_runs: int = 20, + autotune_trtexec_args: str | None = None, **kwargs: Any, ) -> None: """Quantizes the provided ONNX model. 
@@ -547,6 +584,18 @@ def quantize( quantize_mode, trt_plugins, high_precision_dtype, + output_dir=autotune_output_dir, + num_schemes_per_region=autotune_num_schemes_per_region, + pattern_cache_file=autotune_pattern_cache_file, + state_file=autotune_state_file, + qdq_baseline_model=autotune_qdq_baseline, + node_filter_list=autotune_node_filter_list, + verbose=autotune_verbose, + use_trtexec=autotune_use_trtexec, + timing_cache_file=autotune_timing_cache, + warmup_runs=autotune_warmup_runs, + timing_runs=autotune_timing_runs, + trtexec_args=autotune_trtexec_args, ) nodes_to_quantize.extend(nodes_to_quantize_autotune) kwargs["no_quantize_inputs"] = no_quantize_inputs From ddacbcb4cb999e774f73cb96ae39703288f580fb Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:03:21 -0500 Subject: [PATCH 31/42] Fixed tmp_path in test Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../quantization/test_autotune_quantization_integration.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py index a38bde128..86a377bd7 100644 --- a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from unittest.mock import patch import onnx @@ -42,7 +41,7 @@ def _quantized_tensor_indices(model: onnx.ModelProto) -> set[tuple[str, int]]: } -def test_autotune_quantization_integration(tmp_path="./"): +def test_autotune_quantization_integration(tmp_path): """Ensure that the quantized tensors are the same for standalone Autotune and MOQ with Autotune. Runs the autotuner once to obtain a fixed set of insertion points. 
The same @@ -55,7 +54,7 @@ def test_autotune_quantization_integration(tmp_path="./"): - the quantize(autotune=True) output model. """ model_torch, input_tensor = _create_simple_resnet18_onnx_model() - onnx_path = os.path.join(tmp_path, "model.onnx") + onnx_path = tmp_path / "model.onnx" output_path = onnx_path.replace(".onnx", ".quant.onnx") # Export torch model to ONNX From 689a90781efa2887b3d6bb7f9420c58fcdf65316 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:27:30 -0500 Subject: [PATCH 32/42] Fixed copilot comments Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 2 +- tests/_test_utils/onnx/quantization/autotune/models.py | 2 +- .../onnx/quantization/test_autotune_quantization_integration.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 397147ac1..c8930283a 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -250,7 +250,7 @@ def _preprocess_onnx( def _find_nodes_to_quantize_autotune( onnx_model: onnx.ModelProto, quantize_mode: str, - trt_plugins: list[str], + trt_plugins: list[str] | None, high_precision_dtype: str = "fp16", output_dir: str | None = None, num_schemes_per_region: int = 30, diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index 68342b9ff..84a8b4ab8 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -56,7 +56,7 @@ def _create_simple_conv_onnx_model(): return helper.make_model(graph, producer_name="test") -def _create_simple_resnet18_onnx_model(): # -> onnx.ModelProto: +def _create_simple_resnet18_model(): """Build a ResNet-18 subgraph (stem + layer1) for MOQ + Autotuner integration tests. 
Architecture: diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py index 86a377bd7..6e889e131 100644 --- a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -78,7 +78,6 @@ def test_autotune_quantization_integration(tmp_path): onnx_model, quant_type="int8", default_dq_dtype="float16", - keep_output_dir=False, ) # Autotune path: export the Q/DQ model directly and collect quantized tensor slots. From b64322fcae17ca20e469854ef3fec5a8bf33e414 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:41:27 -0500 Subject: [PATCH 33/42] Fix: skip rewiring in graph_utils if no index is found. This prevents silent corruption of the graph. Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/graph_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modelopt/onnx/quantization/graph_utils.py b/modelopt/onnx/quantization/graph_utils.py index 9ef88d4a9..131723e61 100755 --- a/modelopt/onnx/quantization/graph_utils.py +++ b/modelopt/onnx/quantization/graph_utils.py @@ -633,7 +633,17 @@ def remove_partial_input_qdq( for idx, inp in enumerate(target_node_in_graph.inputs) if inp.name == dq_output.name ] - target_input_idx = target_input_idx_arr[0] if target_input_idx_arr else 0 + # If no input index is found (dq_output is not actually connected to target node), skip rewiring to + # prevent silent corruption of the graph. + if not target_input_idx_arr: + logger.warning( + "Expected DequantizeLinear output '%s' to be an input of node '%s', " + "but no matching input was found. 
Skipping Q/DQ bypass for this edge.", + dq_output.name, + target_node_in_graph.name, + ) + continue + target_input_idx = target_input_idx_arr[0] # Connect the target's input directly to source_node's output (bypass Q/DQ) target_node_in_graph.inputs[target_input_idx] = source_node.outputs[0] From 0a32bea36e93237ec3b34492329ef1dd08d958fb Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Thu, 5 Mar 2026 12:30:37 -0500 Subject: [PATCH 34/42] Match args for preset mode default Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index 4ba4f072b..961ec0d5e 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -328,7 +328,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_schemes_per_region", type=int, - default=30, + default=50, help="Number of Q/DQ schemes to test per region.", action=_StoreWithExplicitFlag, explicit_attr="_explicit_autotune_schemes_per_region", @@ -380,7 +380,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_warmup_runs", type=int, - default=5, + default=50, help="Number of warmup runs before timing.", action=_StoreWithExplicitFlag, explicit_attr="_explicit_autotune_warmup_runs", @@ -388,7 +388,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_timing_runs", type=int, - default=20, + default=100, help="Number of timed runs for latency measurement.", action=_StoreWithExplicitFlag, explicit_attr="_explicit_autotune_timing_runs", From 7730b514782c4c55fd4e9cd28e0af127e4935e9a Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:02:05 -0400 Subject: [PATCH 35/42] Exposed _StoreWithExplicitFlag 
Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/__main__.py | 8 ++++---- modelopt/onnx/quantization/autotune/__init__.py | 4 ++-- modelopt/onnx/quantization/autotune/__main__.py | 10 ++++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index 961ec0d5e..e8ee53d76 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -22,7 +22,7 @@ from modelopt.onnx.quantization.autotune import ( MODE_PRESETS, - _StoreWithExplicitFlag, + StoreWithExplicitFlag, get_node_filter_list, ) from modelopt.onnx.quantization.quantize import quantize @@ -330,7 +330,7 @@ def get_parser() -> argparse.ArgumentParser: type=int, default=50, help="Number of Q/DQ schemes to test per region.", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_schemes_per_region", ) autotune_group.add_argument( @@ -382,7 +382,7 @@ def get_parser() -> argparse.ArgumentParser: type=int, default=50, help="Number of warmup runs before timing.", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_warmup_runs", ) autotune_group.add_argument( @@ -390,7 +390,7 @@ def get_parser() -> argparse.ArgumentParser: type=int, default=100, help="Number of timed runs for latency measurement.", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_timing_runs", ) autotune_group.add_argument( diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index 8243cfbef..b00e8c8f9 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -21,7 +21,7 @@ """ # Expose Autotune modes and args -from .__main__ import MODE_PRESETS, _StoreWithExplicitFlag, get_node_filter_list +from .__main__ import 
MODE_PRESETS, StoreWithExplicitFlag, get_node_filter_list # Core data structures from .autotuner import QDQAutotuner @@ -64,8 +64,8 @@ "RegionPattern", "RegionType", "ResolvedInsertionPoint", + "StoreWithExplicitFlag", "TensorRTPyBenchmark", "TrtExecBenchmark", - "_StoreWithExplicitFlag", "get_node_filter_list", ] diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index 9b4a2cd53..0b233d740 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -44,14 +44,16 @@ } -class _StoreWithExplicitFlag(argparse.Action): +class StoreWithExplicitFlag(argparse.Action): """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" def __init__(self, explicit_attr: str, *args, **kwargs): + """Initialize explicit attribute flag.""" self._explicit_attr = explicit_attr super().__init__(*args, **kwargs) def __call__(self, parser, namespace, values, option_string=None): + """Set attributes.""" setattr(namespace, self.dest, values) setattr(namespace, self._explicit_attr, True) @@ -272,7 +274,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: type=int, default=DEFAULT_NUM_SCHEMES, dest="num_schemes", - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_num_schemes", help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)", ) @@ -338,7 +340,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--warmup_runs", type=int, default=DEFAULT_WARMUP_RUNS, - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, explicit_attr="_explicit_warmup_runs", help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)", ) @@ -346,7 +348,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--timing_runs", type=int, default=DEFAULT_TIMING_RUNS, - action=_StoreWithExplicitFlag, + action=StoreWithExplicitFlag, 
explicit_attr="_explicit_timing_runs", help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)", ) From eb0e064bfffb9510d75aed1122ae42fcbafd7d74 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:24:32 -0400 Subject: [PATCH 36/42] Renamed new_ips to new_insertion_points Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/autotune/autotuner_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/autotuner_base.py b/modelopt/onnx/quantization/autotune/autotuner_base.py index 1d5b18f14..6df297e95 100644 --- a/modelopt/onnx/quantization/autotune/autotuner_base.py +++ b/modelopt/onnx/quantization/autotune/autotuner_base.py @@ -464,12 +464,12 @@ def get_resolved_insertion_points( if current_scheme is None: continue self._exclude_overlapping_insertion_points(resolved_insertion_points, region, pattern) - new_ips = pattern.matches(region, self.graph, current_scheme) - if new_ips: - resolved_insertion_points.update(new_ips) + new_insertion_points = pattern.matches(region, self.graph, current_scheme) + if new_insertion_points: + resolved_insertion_points.update(new_insertion_points) matched_regions += 1 if verbose: - logger.debug(f" → Added {len(new_ips)} insertion points") + logger.debug(f" → Added {len(new_insertion_points)} insertion points") if verbose: logger.debug( f"Matched {matched_regions}/{len(self.regions)} regions, " From 7cc54a5c0fc817ad9805a722d698fcc12da613a1 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:59:06 -0400 Subject: [PATCH 37/42] Address coderabbit and copilot issues + other minor issues Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/__main__.py | 18 +++++---- .../onnx/quantization/autotune/__main__.py | 2 +- 
.../onnx/quantization/autotune/workflows.py | 19 ++++++--- modelopt/onnx/quantization/quantize.py | 39 ++++++++++++++++--- .../test_autotune_quantization_integration.py | 7 ++-- 5 files changed, 62 insertions(+), 23 deletions(-) diff --git a/modelopt/onnx/quantization/__main__.py b/modelopt/onnx/quantization/__main__.py index e8ee53d76..8a71291f1 100644 --- a/modelopt/onnx/quantization/__main__.py +++ b/modelopt/onnx/quantization/__main__.py @@ -307,11 +307,11 @@ def get_parser() -> argparse.ArgumentParser: default=None, choices=["quick", "default", "extensive"], help=( - "If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes." + "If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. " "Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): " - " - 'quick': fewer schemes and benchmark runs for for quick exploration;" - " - 'default': balanced, recommended for most cases;" - " - 'extensive': more schemes and runs for extensive search and thorough tuning." + " - 'quick': fewer schemes and benchmark runs for quick exploration; " + " - 'default': balanced, recommended for most cases; " + " - 'extensive': more schemes and runs for extensive search and thorough tuning. " "Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset." 
), ) @@ -328,7 +328,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_schemes_per_region", type=int, - default=50, + default=MODE_PRESETS["default"]["schemes_per_region"], help="Number of Q/DQ schemes to test per region.", action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_schemes_per_region", @@ -380,7 +380,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_warmup_runs", type=int, - default=50, + default=MODE_PRESETS["default"]["warmup_runs"], help="Number of warmup runs before timing.", action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_warmup_runs", @@ -388,7 +388,7 @@ def get_parser() -> argparse.ArgumentParser: autotune_group.add_argument( "--autotune_timing_runs", type=int, - default=100, + default=MODE_PRESETS["default"]["timing_runs"], help="Number of timed runs for latency measurement.", action=StoreWithExplicitFlag, explicit_attr="_explicit_autotune_timing_runs", @@ -459,7 +459,9 @@ def main(): autotune_enabled = args.autotune is not None if autotune_enabled: apply_mode_presets(args) - autotune_node_filter_list = get_node_filter_list(args.autotune_node_filter_list) + autotune_node_filter_list = ( + get_node_filter_list(args.autotune_node_filter_list) if autotune_enabled else None + ) quantize( args.onnx_path, diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index 0b233d740..a66585300 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -180,7 +180,7 @@ def run_autotune() -> int: try: node_filter_list = get_node_filter_list(args.node_filter_list) region_pattern_autotuning_workflow( - model_path=str(model_path), + model_or_path=str(model_path), output_dir=output_dir, num_schemes_per_region=args.num_schemes, pattern_cache_file=args.pattern_cache_file, diff --git a/modelopt/onnx/quantization/autotune/workflows.py 
b/modelopt/onnx/quantization/autotune/workflows.py
index 483cf1314..190882a31 100644
--- a/modelopt/onnx/quantization/autotune/workflows.py
+++ b/modelopt/onnx/quantization/autotune/workflows.py
@@ -20,6 +20,7 @@
 """
 
 import fnmatch
+import shutil
 import tempfile
 from pathlib import Path
 
@@ -159,7 +160,7 @@ def _region_matches_filter(region, graph, filter_patterns: list[str]) -> bool:
 
 
 def region_pattern_autotuning_workflow(
-    model_path: str | onnx.ModelProto,
+    model_or_path: str | onnx.ModelProto,
     output_dir: Path | None = None,
     num_schemes_per_region: int = 30,
     pattern_cache_file: str | None = None,
@@ -196,7 +197,7 @@ def region_pattern_autotuning_workflow(
         7. Export final optimized model with best Q/DQ scheme for each pattern
 
     Args:
-        model_path: Path to ONNX model file to optimize
+        model_or_path: Path to an ONNX model file, or an in-memory onnx.ModelProto, to optimize
         output_dir: Directory for output files (state, logs, models). Created if it doesn't exist.
         num_schemes_per_region: Number of Q/DQ insertion schemes to test per region pattern.
Higher values explore more configurations but take longer (default: 30) @@ -217,6 +218,7 @@ def region_pattern_autotuning_workflow( Returns: QDQAutotuner instance after autotuning """ + output_dir_is_temp = output_dir is None if not output_dir: output_dir = Path(tempfile.mkdtemp()) @@ -230,11 +232,11 @@ def region_pattern_autotuning_workflow( state_file = str(output_dir / "autotuner_state.yaml") state_path = Path(state_file) - if isinstance(model_path, str): - logger.info(f"Loading model: {model_path}") - model = onnx.load(model_path) + if isinstance(model_or_path, str): + logger.info(f"Loading model: {model_or_path}") + model = onnx.load(model_or_path) else: - model = model_path + model = model_or_path pattern_cache = None if pattern_cache_file: @@ -381,4 +383,9 @@ def region_pattern_autotuning_workflow( logger.debug(f" Logs: {logs_dir}") logger.debug(f" Region models: {models_dir}") + # Remove temporary folder + if output_dir_is_temp and output_dir.exists(): + shutil.rmtree(output_dir) + logger.info(f"Temporary directory {output_dir} was deleted!") + return autotuner diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index c8930283a..889493ee3 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -46,6 +46,7 @@ from modelopt.onnx.logging_config import configure_logging, logger from modelopt.onnx.op_types import is_data_dependent_shape_op +from modelopt.onnx.quantization.autotune import MODE_PRESETS from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, region_pattern_autotuning_workflow, @@ -267,7 +268,7 @@ def _find_nodes_to_quantize_autotune( ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: logger.info("Running Auto Q/DQ with TensorRT") - init_benchmark_instance( + benchmark_instance = init_benchmark_instance( use_trtexec=use_trtexec, plugin_libraries=trt_plugins, timing_cache_file=timing_cache_file, @@ -275,8 
+276,10 @@ def _find_nodes_to_quantize_autotune( timing_runs=timing_runs, trtexec_args=trtexec_args.split() if trtexec_args else None, ) - precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} + if benchmark_instance is None: + raise RuntimeError("Failed to initialize TensorRT benchmark") + precision_map = {"fp16": "float16", "fp32": "float32", "bf16": "bfloat16"} autotuner = region_pattern_autotuning_workflow( onnx_model, output_dir=Path(output_dir) if output_dir else None, @@ -327,7 +330,7 @@ def quantize( opset: int | None = None, autotune: bool = False, autotune_output_dir: str | None = None, - autotune_num_schemes_per_region: int = 30, + autotune_num_schemes_per_region: int = MODE_PRESETS["default"]["schemes_per_region"], autotune_pattern_cache_file: str | None = None, autotune_state_file: str | None = None, autotune_qdq_baseline: str | None = None, @@ -335,8 +338,8 @@ def quantize( autotune_verbose: bool = False, autotune_use_trtexec: bool = False, autotune_timing_cache: str | None = None, - autotune_warmup_runs: int = 5, - autotune_timing_runs: int = 20, + autotune_warmup_runs: int = MODE_PRESETS["default"]["warmup_runs"], + autotune_timing_runs: int = MODE_PRESETS["default"]["timing_runs"], autotune_trtexec_args: str | None = None, **kwargs: Any, ) -> None: @@ -464,6 +467,32 @@ def quantize( autotune: If True, detect optimal Q/DQ node placements according to the TensorRT version and platform available. If False, use the default pattern-based quantization approach. + autotune_output_dir: + Output directory for autotune results (state file, logs). Default: temp directory. + autotune_num_schemes_per_region: + Number of Q/DQ schemes to test per region. + autotune_pattern_cache_file: + Path to pattern cache YAML for warm-start. + autotune_qdq_baseline: + Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start. + autotune_state_file: + State file path for crash recovery and resume capability (default: /autotuner_state.yaml). 
+ autotune_node_filter_list: + Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). Regions without + any matching nodes are skipped during autotuning. + autotune_verbose: + Enable verbose logging in the autotuner. + autotune_use_trtexec: + Use trtexec for benchmarking instead of the TensorRT Python API. + autotune_timing_cache: + TensorRT timing cache file for faster engine builds. + autotune_warmup_runs: + Number of warmup runs before timing. + autotune_timing_runs: + Number of timed runs for latency measurement. + autotune_trtexec_args: + Additional trtexec arguments as a single quoted string. + Example: --autotune_trtexec_args '--fp16 --workspace=4096' kwargs: Additional keyword arguments for int4 quantization, including: - awqlite_alpha_step (float): Alpha step for lite, range [0, 1]. diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py index 6e889e131..7dd8c8323 100644 --- a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -13,13 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from unittest.mock import patch import onnx import onnx_graphsurgeon as gs from _test_utils.import_helper import skip_if_no_tensorrt from _test_utils.onnx.lib_test_models import export_as_onnx -from _test_utils.onnx.quantization.autotune.models import _create_simple_resnet18_onnx_model +from _test_utils.onnx.quantization.autotune.models import _create_simple_resnet18_model from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, @@ -53,8 +54,8 @@ def test_autotune_quantization_integration(tmp_path): - the autotuner's own export (via export_onnx), and - the quantize(autotune=True) output model. 
""" - model_torch, input_tensor = _create_simple_resnet18_onnx_model() - onnx_path = tmp_path / "model.onnx" + model_torch, input_tensor = _create_simple_resnet18_model() + onnx_path = os.path.join(tmp_path, "model.onnx") output_path = onnx_path.replace(".onnx", ".quant.onnx") # Export torch model to ONNX From 8634b74f9487e82adf1f97f917a155d75d4d7615 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 15:28:31 -0400 Subject: [PATCH 38/42] Address additional coderabbit and copilot issues Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 889493ee3..7bc6000b5 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -46,11 +46,6 @@ from modelopt.onnx.logging_config import configure_logging, logger from modelopt.onnx.op_types import is_data_dependent_shape_op -from modelopt.onnx.quantization.autotune import MODE_PRESETS -from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, -) from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -254,7 +249,7 @@ def _find_nodes_to_quantize_autotune( trt_plugins: list[str] | None, high_precision_dtype: str = "fp16", output_dir: str | None = None, - num_schemes_per_region: int = 30, + num_schemes_per_region: int = 50, pattern_cache_file: str | None = None, state_file: str | None = None, qdq_baseline_model: str | None = None, @@ -262,12 +257,17 @@ def _find_nodes_to_quantize_autotune( verbose: bool = False, use_trtexec: bool = False, timing_cache_file: str | None = None, - warmup_runs: int = 5, - timing_runs: int = 20, + warmup_runs: int = 50, + timing_runs: int = 100, trtexec_args: str | 
None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: - logger.info("Running Auto Q/DQ with TensorRT") + # Import Autotune dependencies here to avoid making 'tensorrt' and 'cuda' a module-level requirement. + from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, + ) + logger.info("Running Auto Q/DQ with TensorRT") benchmark_instance = init_benchmark_instance( use_trtexec=use_trtexec, plugin_libraries=trt_plugins, @@ -330,7 +330,7 @@ def quantize( opset: int | None = None, autotune: bool = False, autotune_output_dir: str | None = None, - autotune_num_schemes_per_region: int = MODE_PRESETS["default"]["schemes_per_region"], + autotune_num_schemes_per_region: int = 50, autotune_pattern_cache_file: str | None = None, autotune_state_file: str | None = None, autotune_qdq_baseline: str | None = None, @@ -338,8 +338,8 @@ def quantize( autotune_verbose: bool = False, autotune_use_trtexec: bool = False, autotune_timing_cache: str | None = None, - autotune_warmup_runs: int = MODE_PRESETS["default"]["warmup_runs"], - autotune_timing_runs: int = MODE_PRESETS["default"]["timing_runs"], + autotune_warmup_runs: int = 50, + autotune_timing_runs: int = 100, autotune_trtexec_args: str | None = None, **kwargs: Any, ) -> None: From 0d82f64f2cb1292d8032879d06a91ea568d403f7 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:09:32 -0400 Subject: [PATCH 39/42] Added real scales test in the integration workflow Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 15 ++++--- .../test_autotune_quantization_integration.py | 44 ++++++++++++++++--- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 7bc6000b5..0b50aff5f 100755 --- a/modelopt/onnx/quantization/quantize.py +++ 
b/modelopt/onnx/quantization/quantize.py @@ -46,6 +46,14 @@ from modelopt.onnx.logging_config import configure_logging, logger from modelopt.onnx.op_types import is_data_dependent_shape_op + +try: + from modelopt.onnx.quantization.autotune.workflows import ( + init_benchmark_instance, + region_pattern_autotuning_workflow, + ) +except ImportError: + logger.warning("Failed to import Autotune dependencies") from modelopt.onnx.quantization.calib_utils import ( CalibrationDataProvider, CalibrationDataType, @@ -261,13 +269,8 @@ def _find_nodes_to_quantize_autotune( timing_runs: int = 100, trtexec_args: str | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: - # Import Autotune dependencies here to avoid making 'tensorrt' and 'cuda' a module-level requirement. - from modelopt.onnx.quantization.autotune.workflows import ( - init_benchmark_instance, - region_pattern_autotuning_workflow, - ) - logger.info("Running Auto Q/DQ with TensorRT") + benchmark_instance = init_benchmark_instance( use_trtexec=use_trtexec, plugin_libraries=trt_plugins, diff --git a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py index 7dd8c8323..829eebb55 100644 --- a/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py +++ b/tests/gpu/onnx/quantization/test_autotune_quantization_integration.py @@ -42,15 +42,33 @@ def _quantized_tensor_indices(model: onnx.ModelProto) -> set[tuple[str, int]]: } +def _collect_q_scales(model: onnx.ModelProto) -> dict[str, float]: + """Return {scale_initializer_name: float_value} for every QuantizeLinear node. + + Works for both float32 and float16 scale initializers (the latter produced by + the fp16-conversion pass that runs after ORT calibration). 
+ """ + initializers = {init.name: init for init in model.graph.initializer} + scales = {} + for node in model.graph.node: + if node.op_type == "QuantizeLinear" and len(node.input) >= 2: + scale_name = node.input[1] + if scale_name in initializers: + raw = onnx.numpy_helper.to_array(initializers[scale_name]) + scales[scale_name] = float(raw.flat[0]) + return scales + + def test_autotune_quantization_integration(tmp_path): """Ensure that the quantized tensors are the same for standalone Autotune and MOQ with Autotune. - Runs the autotuner once to obtain a fixed set of insertion points. The same - autotuner instance is then injected into quantize() via patching so that both - sides reflect identical placement decisions without a second TRT profiling run. + Also ensure that the scales in the Q/DQ nodes have been updated from standalone Autotune to MOQ with Autotune. - Compares the set of (node_name, input_index) pairs where a DQ node feeds the - input between: + Runs the autotuner once to obtain a fixed set of insertion points. The same autotuner instance is then injected + into quantize() via patching so that both sides reflect identical placement decisions without a second TRT + profiling run. + + Compares the set of (node_name, input_index) pairs where a DQ node feeds the input between: - the autotuner's own export (via export_onnx), and - the quantize(autotune=True) output model. 
""" @@ -93,5 +111,21 @@ def test_autotune_quantization_integration(tmp_path): ): quantize(onnx_path, autotune=True, output_path=output_path) + # Check Q/DQ nodes placement moq_tensors = _quantized_tensor_indices(onnx.load(output_path)) assert autotune_tensors == moq_tensors + + # Check Q/DQ scales + scales_random = _collect_q_scales(autotune_model) + scales_calib = _collect_q_scales(onnx.load(output_path)) + assert scales_random, "Expected at least one Q scale in the standalone Autotune model" + assert scales_calib, "Expected at least one Q scale in the MOQ + Autotune integrated model" + assert len(scales_random.keys()) == len(scales_calib.keys()), ( + "Both models must quantize the same number of tensor" + ) + assert all( + v != list(scales_calib.values())[idx] for idx, v in enumerate(scales_random.values()) + ), ( + "All or some Q/DQ scales are identical between the standalone Autotune and MOQ + Autotune integrated models. " + "The integrated quantization appears to have had no effect on scale computation." 
+ ) From ee873304a636b93ca46f72eec26d42ec21527589 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:51:15 -0400 Subject: [PATCH 40/42] Address additional copilot issues: includes fix for op_types_to_quantize overwrite and other flags (should have the same behavior as pre-autotune) Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/quantize.py | 36 +++++++++---------- .../autotune/test_pattern_cache.py | 1 + .../onnx/quantization/autotune/test_region.py | 6 ---- 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index 0b50aff5f..bbc54a4c7 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -584,22 +584,21 @@ def quantize( # Check op types spelling in 'op_types_to_exclude' and '_to_quantize' validate_op_types_spelling(onnx_path, op_types_to_quantize, op_types_to_exclude) - if not autotune: - # (1) If disable_mha_qdq is set, don't add Q/DQ layers to MatMuls in MHA pattern. - # (2) else when quantize_mode == "int8", if seq_len > 512, don't add Q/DQ layers to - # MatMuls in MHA pattern. - # (3) else when quantize_mode == "fp8", if head_size > 256 or head_size <= 8 - # or mha doesn't meet fp8 fMHA v2 pattern, don't add Q/DQ layers to MatMuls in MHA pattern. - nodes_to_exclude = find_nodes_from_mha_to_exclude( - onnx_path, - use_external_data_format, - nodes_to_exclude, - disable_mha_qdq, - quantize_mode, - intermediate_generated_files, - calibration_data_reader, - calibration_eps, - ) + # (1) If disable_mha_qdq is set, don't add Q/DQ layers to MatMuls in MHA pattern. + # (2) else when quantize_mode == "int8", if seq_len > 512, don't add Q/DQ layers to + # MatMuls in MHA pattern. 
+ # (3) else when quantize_mode == "fp8", if head_size > 256 or head_size <= 8 + # or mha doesn't meet fp8 fMHA v2 pattern, don't add Q/DQ layers to MatMuls in MHA pattern. + nodes_to_exclude = find_nodes_from_mha_to_exclude( + onnx_path, + use_external_data_format, + nodes_to_exclude, + disable_mha_qdq, + quantize_mode, + intermediate_generated_files, + calibration_data_reader, + calibration_eps, + ) if calibrate_per_node and not calibration_shapes: calibration_shapes = get_input_shapes(onnx_path) @@ -608,7 +607,7 @@ def quantize( if autotune: ( nodes_to_quantize_autotune, - op_types_to_quantize, + op_types_to_quantize_autotune, no_quantize_inputs, op_types_needing_output_quant, ) = _find_nodes_to_quantize_autotune( @@ -629,7 +628,8 @@ def quantize( timing_runs=autotune_timing_runs, trtexec_args=autotune_trtexec_args, ) - nodes_to_quantize.extend(nodes_to_quantize_autotune) + op_types_to_quantize = op_types_to_quantize or op_types_to_quantize_autotune + nodes_to_quantize = nodes_to_quantize or nodes_to_quantize_autotune kwargs["no_quantize_inputs"] = no_quantize_inputs kwargs["op_types_needing_output_quant"] = op_types_needing_output_quant diff --git a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py index 294501ff0..a2d61c507 100644 --- a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py +++ b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + """ Tests for PatternCache in the autotuner. 
diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index 5a733017d..34e2cd244 100644 --- a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -13,12 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """Tests for the Region class in the autotuner.""" import pytest From 1a531b95e39b35033ee1004af9a99f6571179bd2 Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:59:04 -0400 Subject: [PATCH 41/42] nit: added docstring and comment Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- modelopt/onnx/quantization/fp8.py | 2 ++ modelopt/onnx/quantization/int8.py | 2 ++ modelopt/onnx/quantization/quantize.py | 1 + 3 files changed, 5 insertions(+) diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py index e181e1864..b7146173a 100755 --- a/modelopt/onnx/quantization/fp8.py +++ b/modelopt/onnx/quantization/fp8.py @@ -220,6 +220,8 @@ def quantize( # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. + # Note that this check will be skipped if Autotune is enabled as Q/DQ node placements + # will be decided according to TensorRT's runtime measurements. 
logger.info("Detecting GEMV patterns for TRT optimization") matmul_nodes_to_exclude = find_nodes_from_matmul_to_exclude( onnx_path, diff --git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py index 27c87abd4..ad2ca9558 100755 --- a/modelopt/onnx/quantization/int8.py +++ b/modelopt/onnx/quantization/int8.py @@ -162,6 +162,8 @@ def quantize( # Either of m or n in matmul is 1, this matmul cannot utilize TensorCores. # The perf of adding Q/DQ layers is not good in TRT. Thus, in this case, # do not add Q/DQ layers to this matmul. + # Note that this check will be skipped if Autotune is enabled as Q/DQ node placements + # will be decided according to TensorRT's runtime measurements. logger.info("Detecting GEMV patterns for TRT optimization") matmul_nodes_to_exclude = find_nodes_from_matmul_to_exclude( onnx_path, diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py index bbc54a4c7..b53904657 100755 --- a/modelopt/onnx/quantization/quantize.py +++ b/modelopt/onnx/quantization/quantize.py @@ -269,6 +269,7 @@ def _find_nodes_to_quantize_autotune( timing_runs: int = 100, trtexec_args: str | None = None, ) -> tuple[list[str], list[str], list[tuple[gs.Node, gs.Node, str]], list[str]]: + """Extracts quantization information from Autotune to provide ORT quantization.""" logger.info("Running Auto Q/DQ with TensorRT") benchmark_instance = init_benchmark_instance( From ede8df0118cdfc1c5d9df19bbc22f59ca50f94fe Mon Sep 17 00:00:00 2001 From: gcunhase <4861122+gcunhase@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:15:13 -0400 Subject: [PATCH 42/42] Created autotune utils Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> --- .../onnx/quantization/autotune/__init__.py | 5 +- .../onnx/quantization/autotune/__main__.py | 64 ++------------- modelopt/onnx/quantization/autotune/utils.py | 81 +++++++++++++++++++ 3 files changed, 89 insertions(+), 61 deletions(-) create mode 100644 
modelopt/onnx/quantization/autotune/utils.py diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index b00e8c8f9..74f44f972 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -20,8 +20,8 @@ region analysis to efficiently explore and optimize Q/DQ insertion strategies. """ -# Expose Autotune modes and args -from .__main__ import MODE_PRESETS, StoreWithExplicitFlag, get_node_filter_list +# Expose Autotune modes +from .__main__ import MODE_PRESETS # Core data structures from .autotuner import QDQAutotuner @@ -45,6 +45,7 @@ ) from .region_pattern import RegionPattern from .region_search import CombinedRegionSearch +from .utils import StoreWithExplicitFlag, get_node_filter_list __all__ = [ "MODE_PRESETS", diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index a66585300..071ba6ceb 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -21,6 +21,11 @@ from pathlib import Path from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.autotune.utils import ( + StoreWithExplicitFlag, + get_node_filter_list, + validate_file_path, +) from modelopt.onnx.quantization.autotune.workflows import ( init_benchmark_instance, region_pattern_autotuning_workflow, @@ -44,20 +49,6 @@ } -class StoreWithExplicitFlag(argparse.Action): - """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" - - def __init__(self, explicit_attr: str, *args, **kwargs): - """Initialize explicit attribute flag.""" - self._explicit_attr = explicit_attr - super().__init__(*args, **kwargs) - - def __call__(self, parser, namespace, values, option_string=None): - """Set attributes.""" - setattr(namespace, self.dest, values) - setattr(namespace, self._explicit_attr, True) - - def apply_mode_presets(args) -> 
None: """Apply --mode preset to schemes_per_region, warmup_runs, timing_runs. @@ -75,30 +66,6 @@ def apply_mode_presets(args) -> None: args.timing_runs = preset["timing_runs"] -def validate_file_path(path: str | None, description: str) -> Path | None: - """Validate that a file path exists. - - Args: - path: Path string to validate (can be None) - description: Description of the file for error messages - - Returns: - Path object if valid, None if path is None - - Raises: - SystemExit: If path is provided but doesn't exist - """ - if path is None: - return None - - path_obj = Path(path) - if not path_obj.exists(): - logger.error(f"{description} not found: {path_obj}") - sys.exit(1) - - return path_obj - - def log_benchmark_config(args): """Log TensorRT benchmark configuration for transparency. @@ -118,27 +85,6 @@ def log_benchmark_config(args): logger.info(f" Trtexec args: {args.trtexec_benchmark_args}") -def get_node_filter_list(node_filter_list_path: str) -> list | None: - """Extract node filter list from node filters path. - - Args: - node_filter_list_path: Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). - - Returns: - Node filter list - """ - node_filter_list = None - if node_filter_list_path: - filter_file = validate_file_path(node_filter_list_path, "Node filter list file") - if filter_file: - with open(filter_file) as f: - node_filter_list = [ - line.strip() for line in f if line.strip() and not line.strip().startswith("#") - ] - logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") - return node_filter_list - - def run_autotune() -> int: """Execute the complete pattern-based Q/DQ autotuning workflow. 
diff --git a/modelopt/onnx/quantization/autotune/utils.py b/modelopt/onnx/quantization/autotune/utils.py new file mode 100644 index 000000000..8760b4bc1 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/utils.py @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions related to Autotune.""" + +import argparse +import sys +from pathlib import Path + +from modelopt.onnx.logging_config import logger + + +class StoreWithExplicitFlag(argparse.Action): + """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" + + def __init__(self, explicit_attr: str, *args, **kwargs): + """Initialize explicit attribute flag.""" + self._explicit_attr = explicit_attr + super().__init__(*args, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + """Set attributes.""" + setattr(namespace, self.dest, values) + setattr(namespace, self._explicit_attr, True) + + +def validate_file_path(path: str | None, description: str) -> Path | None: + """Validate that a file path exists. 
+ + Args: + path: Path string to validate (can be None) + description: Description of the file for error messages + + Returns: + Path object if valid, None if path is None + + Raises: + SystemExit: If path is provided but doesn't exist + """ + if path is None: + return None + + path_obj = Path(path) + if not path_obj.exists(): + logger.error(f"{description} not found: {path_obj}") + sys.exit(1) + + return path_obj + + +def get_node_filter_list(node_filter_list_path: str) -> list | None: + """Extract node filter list from node filters path. + + Args: + node_filter_list_path: Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). + + Returns: + Node filter list + """ + node_filter_list = None + if node_filter_list_path: + filter_file = validate_file_path(node_filter_list_path, "Node filter list file") + if filter_file: + with open(filter_file) as f: + node_filter_list = [ + line.strip() for line in f if line.strip() and not line.strip().startswith("#") + ] + logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}") + return node_filter_list