Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
3554ecf
Initial autotune codebase
gcunhase Jan 24, 2026
56a67e3
Add more tests
gcunhase Jan 26, 2026
6d58b4a
Refactor: PR #702
gcunhase Jan 26, 2026
710319a
Remove python path in tests
gcunhase Jan 26, 2026
98a60b5
Recover docstrings and simplify code (->, , )
gcunhase Jan 27, 2026
91cef9c
Added unittest for workflows.py (failing)
gcunhase Jan 27, 2026
7937cc2
Fix: 'Autotuning failed: 'PatternSchemes' object has no attribute 'no…
gcunhase Jan 27, 2026
7c4e14b
Updated workflow test to test TRT and PythonTRT benchmarking
gcunhase Jan 27, 2026
64836ed
Fix test: use_trtexec flag
gcunhase Jan 28, 2026
a9af36a
Add real scales to Q/DQ nodes
gcunhase Feb 19, 2026
29e8dd2
fix precommit failures
gcunhase Feb 24, 2026
7f69882
Fix: Add->Q/DQ->Activation(Relu)
gcunhase Feb 24, 2026
bb030be
Fix: correctly dequantize Add input with shared Q/DQ
gcunhase Feb 24, 2026
616227c
[5916893] Fix weighted ops quantization logic: both input and weights…
gcunhase Feb 24, 2026
afee0a4
Changed keep_output_dir to True as default
gcunhase Mar 2, 2026
faf0bbb
test_workflow was moved to 'tests/gpu/onnx'
gcunhase Mar 2, 2026
08bf713
Removed cli.py, moved into __main__.py
gcunhase Mar 2, 2026
81fce48
Removed PatternSchemes import from region_pattern.py: no longer needed.
gcunhase Mar 2, 2026
7a57b8d
Added intermediate Autotune model to be removed at the end of the qua…
gcunhase Mar 2, 2026
a71fc91
Removed _MUTATION_SPECS from autotuner.py: moved to autotuner_base.py
gcunhase Mar 2, 2026
01e8be0
Removed test_config and test_pattern_cache. Should be added in the or…
gcunhase Mar 2, 2026
ad7a60d
Fixed minor coderabbit suggestions
gcunhase Mar 2, 2026
7589668
Moved autotune imports to the top of the file
gcunhase Mar 2, 2026
db4c3ef
Eliminate intermediate ONNX export in _find_nodes_to_quantize_autotun…
gcunhase Mar 3, 2026
42a0bdf
Add support for Add->Q/DQ->Relu patterns by including those 'Add' nod…
gcunhase Mar 3, 2026
a70dbd3
Add integration test
gcunhase Mar 3, 2026
e1c8af7
Remove 'keep_output_dir' arg (no longer needed due to tmp path)
gcunhase Mar 3, 2026
159b9f2
Remove 'get_quantized_nodes' and other comments that are no longer ne…
gcunhase Mar 3, 2026
51df982
Added docstring for 'default_dq_dtype' in workflows.py
gcunhase Mar 3, 2026
1dc03cd
Added mode presets and additional autotune configurations
gcunhase Mar 5, 2026
ddacbcb
Fixed tmp_path in test
gcunhase Mar 5, 2026
689a907
Fixed copilot comments
gcunhase Mar 5, 2026
b64322f
Fix: skip rewiring in graph_utils if no index is found. This prevents…
gcunhase Mar 5, 2026
0a32bea
Match args for preset mode default
gcunhase Mar 5, 2026
7730b51
Exposed _StoreWithExplicitFlag
gcunhase Mar 9, 2026
eb0e064
Renamed new_ips to new_insertion_points
gcunhase Mar 9, 2026
7cc54a5
Address coderabbit and copilot issues + other minor issues
gcunhase Mar 9, 2026
8634b74
Address additional coderabbit and copilot issues
gcunhase Mar 9, 2026
0d82f64
Added real scales test in the integration workflow
gcunhase Mar 9, 2026
ee87330
Address additional copilot issues: includes fix for op_types_to_quant…
gcunhase Mar 9, 2026
1a531b9
nit: added docstring and comment
gcunhase Mar 9, 2026
ede8df0
Created autotune utils
gcunhase Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions modelopt/onnx/op_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,3 +386,25 @@ def get_symmetric_ops():
"BitwiseOr",
"BitwiseXor",
}


def get_activation_ops():
    """Returns set of activation operations."""
    # Grouped for readability: rectifier variants, squashing functions,
    # softmax-style normalizers, and other pointwise activations.
    rectifier_ops = ("Relu", "LeakyRelu", "PRelu", "Elu", "Selu", "ThresholdedRelu")
    squashing_ops = ("Sigmoid", "Tanh", "HardSigmoid")
    softmax_ops = ("Softmax", "LogSoftmax")
    other_ops = ("Clip", "Softplus", "Softsign", "Swish", "HardSwish")
    return set(rectifier_ops + squashing_ops + softmax_ops + other_ops)
145 changes: 145 additions & 0 deletions modelopt/onnx/quantization/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@

import numpy as np

from modelopt.onnx.quantization.autotune import (
MODE_PRESETS,
StoreWithExplicitFlag,
get_node_filter_list,
)
from modelopt.onnx.quantization.quantize import quantize

__all__ = ["main"]
Expand Down Expand Up @@ -295,9 +300,128 @@ def get_parser() -> argparse.ArgumentParser:
"if certain operations require a higher version."
),
)
argparser.add_argument(
"--autotune",
nargs="?",
const="default",
default=None,
choices=["quick", "default", "extensive"],
help=(
"If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. "
"Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): "
" - 'quick': fewer schemes and benchmark runs for quick exploration; "
" - 'default': balanced, recommended for most cases; "
" - 'extensive': more schemes and runs for extensive search and thorough tuning. "
"Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset."
),
)

autotune_group = argparser.add_argument_group(
"Autotune (only applicable when --autotune is set)"
)
autotune_group.add_argument(
"--autotune_output_dir",
type=str,
default=None,
help="Output directory for autotune results (state file, logs). Default: temp directory.",
)
autotune_group.add_argument(
"--autotune_schemes_per_region",
type=int,
default=MODE_PRESETS["default"]["schemes_per_region"],
help="Number of Q/DQ schemes to test per region.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_schemes_per_region",
)
autotune_group.add_argument(
"--autotune_pattern_cache",
type=str,
default=None,
dest="autotune_pattern_cache_file",
help="Path to pattern cache YAML for warm-start.",
)
autotune_group.add_argument(
"--autotune_qdq_baseline",
type=str,
default=None,
help="Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start.",
)
autotune_group.add_argument(
"--autotune_state_file",
type=str,
default=None,
help="State file path for crash recovery and resume capability (default: <output_dir>/autotuner_state.yaml).",
)
autotune_group.add_argument(
"--autotune_node_filter_list",
type=str,
default=None,
help=(
"Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). "
"Regions without any matching nodes are skipped during autotuning."
),
)
autotune_group.add_argument(
"--autotune_verbose",
action="store_true",
help="Enable verbose logging in the autotuner.",
)
autotune_group.add_argument(
"--autotune_use_trtexec",
action="store_true",
help="Use trtexec for benchmarking instead of the TensorRT Python API.",
)
autotune_group.add_argument(
"--autotune_timing_cache",
type=str,
default=None,
help="TensorRT timing cache file for faster engine builds.",
)
autotune_group.add_argument(
"--autotune_warmup_runs",
type=int,
default=MODE_PRESETS["default"]["warmup_runs"],
help="Number of warmup runs before timing.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_warmup_runs",
)
autotune_group.add_argument(
"--autotune_timing_runs",
type=int,
default=MODE_PRESETS["default"]["timing_runs"],
help="Number of timed runs for latency measurement.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_timing_runs",
)
autotune_group.add_argument(
"--autotune_trtexec_args",
type=str,
default=None,
help=(
"Additional trtexec arguments as a single quoted string. "
"Example: --autotune_trtexec_args '--fp16 --workspace=4096'"
),
)
return argparser


def apply_mode_presets(args) -> None:
    """Apply --autotune=mode preset to schemes_per_region, warmup_runs, timing_runs.

    Only applies preset for an option when that option was not explicitly set on the
    command line (explicit flags override the preset).
    """
    if args.autotune not in MODE_PRESETS:
        return
    preset = MODE_PRESETS[args.autotune]
    # (namespace attribute, "explicitly set" sentinel, preset key) triples;
    # a sentinel set by StoreWithExplicitFlag wins over the preset value.
    preset_targets = (
        ("autotune_schemes_per_region", "_explicit_autotune_schemes_per_region", "schemes_per_region"),
        ("autotune_warmup_runs", "_explicit_autotune_warmup_runs", "warmup_runs"),
        ("autotune_timing_runs", "_explicit_autotune_timing_runs", "timing_runs"),
    )
    for dest, explicit_flag, preset_key in preset_targets:
        if not getattr(args, explicit_flag, False):
            setattr(args, dest, preset[preset_key])


def main():
"""Command-line entrypoint for ONNX PTQ."""
args = get_parser().parse_args()
Expand Down Expand Up @@ -331,6 +455,14 @@ def main():
else:
raise

# Autotune configs
autotune_enabled = args.autotune is not None
if autotune_enabled:
apply_mode_presets(args)
autotune_node_filter_list = (
get_node_filter_list(args.autotune_node_filter_list) if autotune_enabled else None
)

quantize(
args.onnx_path,
quantize_mode=args.quantize_mode,
Expand Down Expand Up @@ -362,6 +494,19 @@ def main():
calibrate_per_node=args.calibrate_per_node,
direct_io_types=args.direct_io_types,
opset=args.opset,
autotune=autotune_enabled,
autotune_output_dir=args.autotune_output_dir,
autotune_num_schemes_per_region=args.autotune_schemes_per_region,
autotune_pattern_cache_file=args.autotune_pattern_cache_file,
autotune_state_file=args.autotune_state_file,
autotune_qdq_baseline=args.autotune_qdq_baseline,
autotune_node_filter_list=autotune_node_filter_list,
autotune_verbose=args.autotune_verbose,
autotune_use_trtexec=args.autotune_use_trtexec,
autotune_timing_cache=args.autotune_timing_cache,
autotune_warmup_runs=args.autotune_warmup_runs,
autotune_timing_runs=args.autotune_timing_runs,
autotune_trtexec_args=args.autotune_trtexec_args,
)


Expand Down
7 changes: 7 additions & 0 deletions modelopt/onnx/quantization/autotune/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
region analysis to efficiently explore and optimize Q/DQ insertion strategies.
"""

# Expose Autotune modes
from .__main__ import MODE_PRESETS

# Core data structures
from .autotuner import QDQAutotuner
from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark
Expand All @@ -42,8 +45,10 @@
)
from .region_pattern import RegionPattern
from .region_search import CombinedRegionSearch
from .utils import StoreWithExplicitFlag, get_node_filter_list

__all__ = [
"MODE_PRESETS",
"AutotunerError",
"AutotunerNotInitializedError",
"ChildRegionInputInsertionPoint",
Expand All @@ -60,6 +65,8 @@
"RegionPattern",
"RegionType",
"ResolvedInsertionPoint",
"StoreWithExplicitFlag",
"TensorRTPyBenchmark",
"TrtExecBenchmark",
"get_node_filter_list",
]
62 changes: 10 additions & 52 deletions modelopt/onnx/quantization/autotune/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
from pathlib import Path

from modelopt.onnx.logging_config import logger
from modelopt.onnx.quantization.autotune.utils import (
StoreWithExplicitFlag,
get_node_filter_list,
validate_file_path,
)
from modelopt.onnx.quantization.autotune.workflows import (
init_benchmark_instance,
region_pattern_autotuning_workflow,
Expand All @@ -44,18 +49,6 @@
}


class _StoreWithExplicitFlag(argparse.Action):
    """Store the value and set an 'explicit' flag on the namespace so mode presets do not override."""

    def __init__(self, explicit_attr: str, *args, **kwargs):
        # Remember which namespace attribute marks this option as user-provided.
        super().__init__(*args, **kwargs)
        self._explicit_attr = explicit_attr

    def __call__(self, parser, namespace, values, option_string=None):
        # Behave like the default 'store' action, then record that the option
        # was explicitly given so preset application can skip it.
        setattr(namespace, self.dest, values)
        setattr(namespace, self._explicit_attr, True)


def apply_mode_presets(args) -> None:
"""Apply --mode preset to schemes_per_region, warmup_runs, timing_runs.

Expand All @@ -73,30 +66,6 @@ def apply_mode_presets(args) -> None:
args.timing_runs = preset["timing_runs"]


def validate_file_path(path: str | None, description: str) -> Path | None:
"""Validate that a file path exists.

Args:
path: Path string to validate (can be None)
description: Description of the file for error messages

Returns:
Path object if valid, None if path is None

Raises:
SystemExit: If path is provided but doesn't exist
"""
if path is None:
return None

path_obj = Path(path)
if not path_obj.exists():
logger.error(f"{description} not found: {path_obj}")
sys.exit(1)

return path_obj


def log_benchmark_config(args):
"""Log TensorRT benchmark configuration for transparency.

Expand Down Expand Up @@ -155,20 +124,9 @@ def run_autotune() -> int:
return 1

try:
node_filter_list = None
if args.node_filter_list:
filter_file = validate_file_path(args.node_filter_list, "Node filter list file")
if filter_file:
with open(filter_file) as f:
node_filter_list = [
line.strip()
for line in f
if line.strip() and not line.strip().startswith("#")
]
logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}")

node_filter_list = get_node_filter_list(args.node_filter_list)
region_pattern_autotuning_workflow(
model_path=str(model_path),
model_or_path=str(model_path),
output_dir=output_dir,
num_schemes_per_region=args.num_schemes,
pattern_cache_file=args.pattern_cache_file,
Expand Down Expand Up @@ -262,7 +220,7 @@ def _get_autotune_parser() -> argparse.ArgumentParser:
type=int,
default=DEFAULT_NUM_SCHEMES,
dest="num_schemes",
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_num_schemes",
help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)",
)
Expand Down Expand Up @@ -328,15 +286,15 @@ def _get_autotune_parser() -> argparse.ArgumentParser:
"--warmup_runs",
type=int,
default=DEFAULT_WARMUP_RUNS,
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_warmup_runs",
help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)",
)
trt_group.add_argument(
"--timing_runs",
type=int,
default=DEFAULT_TIMING_RUNS,
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_timing_runs",
help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)",
)
Expand Down
Loading
Loading