Adding new setting, autotune_effort=[none/quick/full] (#913)

choijon5 · jansel · web-flow · commit c8f83fbc9f89 · 2025-10-13T16:27:29.000-07:00
Co-authored-by: Jason Ansel <jansel@meta.com>
diff --git a/docs/api/autotuner.md b/docs/api/autotuner.md
@@ -2,6 +2,8 @@
 
 The `helion.autotuner` module provides automatic optimization of kernel configurations.
 
+Autotuning effort can be adjusted via {attr}`helion.Settings.autotune_effort`, which sets how large a search each algorithm runs: `"none"` disables autotuning, `"quick"` runs a smaller search, and `"full"` uses the full search budget. Individual autotuning parameters can still be overridden explicitly when finer control is needed.
+
 ```{eval-rst}
 .. currentmodule:: helion.autotuner
 
diff --git a/docs/api/kernel.md b/docs/api/kernel.md
@@ -155,6 +155,7 @@ Settings control **how the kernel is compiled** and the development environment:
 @helion.kernel(
     # Settings parameters
     use_default_config=True,      # Skip autotuning for development
+    autotune_effort="quick",      # Smaller autotuning budget when search is enabled
     print_output_code=True,       # Debug: show generated Triton code
     static_shapes=True,           # Compilation optimization strategy
     autotune_log_level=logging.DEBUG  # Verbose autotuning output
diff --git a/docs/api/settings.md b/docs/api/settings.md
@@ -160,6 +160,18 @@ with helion.set_default_settings(
 .. autoattribute:: Settings.autotune_config_overrides
 
    Dict of config key/value pairs to force during autotuning. Useful for disabling problematic candidates or pinning experimental options.
+
+.. autoattribute:: Settings.autotune_effort
+
+   Select the autotuning effort preset. Available values:
+
+   - ``"none"`` – skip autotuning and run the default configuration (equivalent to ``use_default_config=True``).
+   - ``"quick"`` – limited search for faster runs with decent performance.
+   - ``"full"`` – run each search algorithm with its full default budget (the default; same behavior as before this setting existed).
+
+   Users can still override individual ``autotune_*`` settings; explicit values take precedence over the preset. The preset can also be selected with the ``HELION_AUTOTUNE_EFFORT`` environment variable.
+
+
 ```
 
 ### Autotuning Cache
diff --git a/helion/autotuner/__init__.py b/helion/autotuner/__init__.py
@@ -9,6 +9,10 @@
 from .differential_evolution import (
     DifferentialEvolutionSearch as DifferentialEvolutionSearch,
 )
+from .effort_profile import AutotuneEffortProfile as AutotuneEffortProfile
+from .effort_profile import DifferentialEvolutionConfig as DifferentialEvolutionConfig
+from .effort_profile import PatternSearchConfig as PatternSearchConfig
+from .effort_profile import RandomSearchConfig as RandomSearchConfig
 from .finite_search import FiniteSearch as FiniteSearch
 from .local_cache import LocalAutotuneCache as LocalAutotuneCache
 from .local_cache import StrictLocalAutotuneCache as StrictLocalAutotuneCache
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
@@ -550,10 +550,9 @@ def should_rebenchmark(self, member: PopulationMember) -> bool:
         Returns:
             True if the member should be re-benchmarked, False otherwise.
         """
-        return (
+        threshold = self.settings.get_rebenchmark_threshold()
+        return member.perf < threshold * self.best_perf_so_far and math.isfinite(
             member.perf
-            < self.settings.autotune_rebenchmark_threshold * self.best_perf_so_far
-            and math.isfinite(member.perf)
         )
 
     def rebenchmark(
@@ -568,7 +567,14 @@ def rebenchmark(
         """
         if len(members) < 2:
             return
-        repeat = min(1000, max(3, int(200 / self.best_perf_so_far)))
+
+        # Calculate repeat count based on best performance
+        base_repeat = (
+            int(200 / self.best_perf_so_far)
+            if math.isfinite(self.best_perf_so_far) and self.best_perf_so_far > 0
+            else 1000
+        )
+        repeat = min(1000, max(3, base_repeat))
         iterator = [functools.partial(m.fn, *self.args) for m in members]
         if self.settings.autotune_progress_bar:
             new_timings = interleaved_bench(iterator, repeat=repeat, desc=desc)
diff --git a/helion/autotuner/differential_evolution.py b/helion/autotuner/differential_evolution.py
@@ -8,6 +8,7 @@
 from .base_search import PopulationMember
 from .base_search import performance
 from .base_search import population_statistics
+from .effort_profile import DIFFERENTIAL_EVOLUTION_DEFAULTS
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
@@ -26,8 +27,8 @@ def __init__(
         self,
         kernel: BoundKernel,
         args: Sequence[object],
-        population_size: int = 40,
-        max_generations: int = 40,
+        population_size: int = DIFFERENTIAL_EVOLUTION_DEFAULTS.population_size,
+        max_generations: int = DIFFERENTIAL_EVOLUTION_DEFAULTS.max_generations,
         crossover_rate: float = 0.8,
         immediate_update: bool | None = None,
     ) -> None:
diff --git a/helion/autotuner/effort_profile.py b/helion/autotuner/effort_profile.py
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+AutotuneEffort = Literal["none", "quick", "full"]
+
+
+@dataclass(frozen=True)
+class PatternSearchConfig:
+    initial_population: int
+    copies: int
+    max_generations: int
+
+
+@dataclass(frozen=True)
+class DifferentialEvolutionConfig:
+    population_size: int
+    max_generations: int
+
+
+@dataclass(frozen=True)
+class RandomSearchConfig:
+    count: int
+
+
+# Default values for each algorithm (single source of truth)
+PATTERN_SEARCH_DEFAULTS = PatternSearchConfig(
+    initial_population=100,
+    copies=5,
+    max_generations=20,
+)
+
+DIFFERENTIAL_EVOLUTION_DEFAULTS = DifferentialEvolutionConfig(
+    population_size=40,
+    max_generations=40,
+)
+
+RANDOM_SEARCH_DEFAULTS = RandomSearchConfig(
+    count=1000,
+)
+
+
+@dataclass(frozen=True)
+class AutotuneEffortProfile:
+    pattern_search: PatternSearchConfig | None
+    differential_evolution: DifferentialEvolutionConfig | None
+    random_search: RandomSearchConfig | None
+    rebenchmark_threshold: float = 1.5
+
+
+_PROFILES: dict[AutotuneEffort, AutotuneEffortProfile] = {
+    "none": AutotuneEffortProfile(
+        pattern_search=None,
+        differential_evolution=None,
+        random_search=None,
+    ),
+    "quick": AutotuneEffortProfile(
+        pattern_search=PatternSearchConfig(
+            initial_population=30,
+            copies=2,
+            max_generations=5,
+        ),
+        differential_evolution=DifferentialEvolutionConfig(
+            population_size=20,
+            max_generations=8,
+        ),
+        random_search=RandomSearchConfig(
+            count=100,
+        ),
+        rebenchmark_threshold=0.9,  # <1.0 effectively disables rebenchmarking
+    ),
+    "full": AutotuneEffortProfile(
+        pattern_search=PATTERN_SEARCH_DEFAULTS,
+        differential_evolution=DIFFERENTIAL_EVOLUTION_DEFAULTS,
+        random_search=RANDOM_SEARCH_DEFAULTS,
+    ),
+}
+
+
+def get_effort_profile(effort: AutotuneEffort) -> AutotuneEffortProfile:
+    return _PROFILES[effort]
diff --git a/helion/autotuner/pattern_search.py b/helion/autotuner/pattern_search.py
@@ -8,6 +8,7 @@
 from .base_search import PopulationBasedSearch
 from .base_search import PopulationMember
 from .base_search import performance
+from .effort_profile import PATTERN_SEARCH_DEFAULTS
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
@@ -25,9 +26,9 @@ def __init__(
         kernel: BoundKernel,
         args: Sequence[object],
         *,
-        initial_population: int = 100,
-        copies: int = 5,
-        max_generations: int = 20,
+        initial_population: int = PATTERN_SEARCH_DEFAULTS.initial_population,
+        copies: int = PATTERN_SEARCH_DEFAULTS.copies,
+        max_generations: int = PATTERN_SEARCH_DEFAULTS.max_generations,
         min_improvement_delta: float = 0.001,
     ) -> None:
         """
diff --git a/helion/autotuner/random_search.py b/helion/autotuner/random_search.py
@@ -3,6 +3,7 @@
 from typing import TYPE_CHECKING
 
 from .config_generation import ConfigGeneration
+from .effort_profile import RANDOM_SEARCH_DEFAULTS
 from .finite_search import FiniteSearch
 
 if TYPE_CHECKING:
@@ -31,7 +32,7 @@ def __init__(
         self,
         kernel: BoundKernel,
         args: Sequence[object],
-        count: int = 1000,
+        count: int = RANDOM_SEARCH_DEFAULTS.count,
     ) -> None:
         super().__init__(
             kernel,
diff --git a/helion/runtime/kernel.py b/helion/runtime/kernel.py
@@ -571,7 +571,7 @@ def _implicit_config(self) -> Config | None:
             return self._config
         if len(configs) == 1:
             return configs[0]
-        if len(configs) == 0 and self.kernel.settings.use_default_config:
+        if len(configs) == 0 and self.kernel.settings.autotune_effort == "none":
             config = self.config_spec.default_config()
             if not is_ref_mode_enabled(self.kernel.settings):
                 kernel_decorator = self.format_kernel_decorator(config, self.settings)
diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -16,6 +16,8 @@
 from torch._environment import is_fbcode
 
 from helion import exc
+from helion.autotuner.effort_profile import AutotuneEffort
+from helion.autotuner.effort_profile import get_effort_profile
 from helion.runtime.ref_mode import RefMode
 
 if TYPE_CHECKING:
@@ -81,6 +83,27 @@ def default_autotuner_fn(
                 "max_generations", bound_kernel.settings.autotune_max_generations
             )
 
+    profile = get_effort_profile(bound_kernel.settings.autotune_effort)
+
+    if autotuner_cls.__name__ == "PatternSearch":
+        assert profile.pattern_search is not None
+        kwargs.setdefault(
+            "initial_population", profile.pattern_search.initial_population
+        )
+        kwargs.setdefault("copies", profile.pattern_search.copies)
+        kwargs.setdefault("max_generations", profile.pattern_search.max_generations)
+    elif autotuner_cls.__name__ == "DifferentialEvolutionSearch":
+        assert profile.differential_evolution is not None
+        kwargs.setdefault(
+            "population_size", profile.differential_evolution.population_size
+        )
+        kwargs.setdefault(
+            "max_generations", profile.differential_evolution.max_generations
+        )
+    elif autotuner_cls.__name__ == "RandomSearch":
+        assert profile.random_search is not None
+        kwargs.setdefault("count", profile.random_search.count)
+
     return LocalAutotuneCache(autotuner_cls(bound_kernel, args, **kwargs))  # pyright: ignore[reportArgumentType]
 
 
@@ -98,6 +121,17 @@ def _get_autotune_max_generations() -> int | None:
     return None
 
 
+def _get_autotune_rebenchmark_threshold() -> float | None:
+    value = os.environ.get("HELION_REBENCHMARK_THRESHOLD")
+    if value is not None:
+        return float(value)
+    return None  # Will use effort profile default
+
+
+def _get_autotune_effort() -> AutotuneEffort:
+    return cast("AutotuneEffort", os.environ.get("HELION_AUTOTUNE_EFFORT", "full"))
+
+
 @dataclasses.dataclass
 class _Settings:
     # see __slots__ below for the doc strings that show up in help(Settings)
@@ -110,7 +144,6 @@ class _Settings:
         os.environ.get("TRITON_F32_DEFAULT", "tf32"),
     )
     static_shapes: bool = False
-    use_default_config: bool = os.environ.get("HELION_USE_DEFAULT_CONFIG", "0") == "1"
     autotune_log_level: int = logging.INFO
     autotune_compile_timeout: int = int(
         os.environ.get("HELION_AUTOTUNE_COMPILE_TIMEOUT", "60")
@@ -123,8 +156,8 @@ class _Settings:
     autotune_accuracy_check: bool = (
         os.environ.get("HELION_AUTOTUNE_ACCURACY_CHECK", "1") == "1"
     )
-    autotune_rebenchmark_threshold: float = float(
-        os.environ.get("HELION_REBENCHMARK_THRESHOLD", "1.5")
+    autotune_rebenchmark_threshold: float | None = dataclasses.field(
+        default_factory=_get_autotune_rebenchmark_threshold
     )
     autotune_progress_bar: bool = (
         os.environ.get("HELION_AUTOTUNE_PROGRESS_BAR", "1") == "1"
@@ -137,6 +170,9 @@ class _Settings:
     autotune_config_overrides: dict[str, object] = dataclasses.field(
         default_factory=dict
     )
+    autotune_effort: AutotuneEffort = dataclasses.field(
+        default_factory=_get_autotune_effort
+    )
     allow_warp_specialize: bool = (
         os.environ.get("HELION_ALLOW_WARP_SPECIALIZE", "1") == "1"
     )
@@ -153,19 +189,18 @@ class Settings(_Settings):
     compilation process. Unlike a Config, settings are not auto-tuned and set by the user.
     """
 
-    __slots__: dict[str, str] = {
+    __slots__ = {
         "ignore_warnings": "Subtypes of exc.BaseWarning to ignore when compiling.",
         "index_dtype": "The dtype to use for index variables. Default is torch.int32.",
         "dot_precision": "Precision for dot products, see `triton.language.dot`. Can be 'tf32', 'tf32x3', or 'ieee'.",
         "static_shapes": "If True, use static shapes for all tensors. This is a performance optimization.",
-        "use_default_config": "For development only, skips all autotuning and uses the default config (which may be slow).",
         "autotune_log_level": "Log level for autotuning using Python logging levels. Default is logging.INFO. Use 0 to disable all output.",
         "autotune_compile_timeout": "Timeout for Triton compilation in seconds used for autotuning. Default is 60 seconds.",
         "autotune_precompile": "If True, precompile the kernel before autotuning. Requires fork-safe environment.",
         "autotune_precompile_jobs": "Maximum concurrent Triton precompile processes, default to cpu count.",
         "autotune_random_seed": "Seed used for autotuner random number generation. Defaults to HELION_AUTOTUNE_RANDOM_SEED or a time-based seed.",
         "autotune_accuracy_check": "If True, validate candidate configs against the baseline kernel output before accepting them during autotuning.",
-        "autotune_rebenchmark_threshold": "If a config is within threshold*best_perf, re-benchmark it to avoid outliers. Default is 1.5x.  Set to <1 to disable.",
+        "autotune_rebenchmark_threshold": "If a config is within threshold*best_perf, re-benchmark it to avoid outliers. Defaults to effort profile value. Set HELION_REBENCHMARK_THRESHOLD to override.",
         "autotune_progress_bar": "If True, show progress bar during autotuning. Default is True. Set HELION_AUTOTUNE_PROGRESS_BAR=0 to disable.",
         "autotune_max_generations": "Override the maximum number of generations for Pattern Search and Differential Evolution Search autotuning algorithms with HELION_AUTOTUNE_MAX_GENERATIONS=N or @helion.kernel(autotune_max_generations=N).",
         "print_output_code": "If True, print the output code of the kernel to stderr.",
@@ -175,8 +210,8 @@ class Settings(_Settings):
         "debug_dtype_asserts": "If True, emit tl.static_assert checks for dtype after each device node.",
         "ref_mode": "Reference mode for kernel execution. Can be RefMode.OFF or RefMode.EAGER.",
         "autotuner_fn": "Function to create an autotuner",
+        "autotune_effort": "Autotuning effort preset. One of 'none', 'quick', 'full'.",
     }
-    assert __slots__.keys() == {field.name for field in dataclasses.fields(_Settings)}
 
     def __init__(self, **settings: object) -> None:
         """
@@ -187,6 +222,14 @@ def __init__(self, **settings: object) -> None:
             settings: Keyword arguments representing various settings.
         """
 
+        # Translate use_default_config to autotune_effort='none' for backward compatibility
+        if (
+            settings.get("use_default_config")
+            or os.environ.get("HELION_USE_DEFAULT_CONFIG") == "1"
+        ):
+            settings.setdefault("autotune_effort", "none")
+        settings.pop("use_default_config", None)
+
         if defaults := getattr(_tls, "default_settings", None):
             settings = {**defaults.to_dict(), **settings}
 
@@ -207,7 +250,13 @@ def shallow_copy(x: object) -> object:
                 return x.copy()
             return x
 
-        return {k: shallow_copy(v) for k, v in dataclasses.asdict(self).items()}
+        # Only include fields that are meant to be public (repr=True)
+        public_fields = {f.name for f in dataclasses.fields(self) if f.repr}
+        return {
+            k: shallow_copy(v)
+            for k, v in dataclasses.asdict(self).items()
+            if k in public_fields
+        }
 
     def check_autotuning_disabled(self) -> None:
         msg = None
@@ -223,6 +272,21 @@ def check_autotuning_disabled(self) -> None:
         if msg:
             raise exc.AutotuningDisallowedInEnvironment(msg)
 
+    def get_rebenchmark_threshold(self) -> float:
+        """
+        Get the effective rebenchmark threshold.
+        Uses the explicit setting if provided, otherwise falls back to the effort profile default.
+
+        Returns:
+            float: The rebenchmark threshold value.
+        """
+        if self.autotune_rebenchmark_threshold is not None:
+            return self.autotune_rebenchmark_threshold
+
+        from ..autotuner.effort_profile import get_effort_profile
+
+        return get_effort_profile(self.autotune_effort).rebenchmark_threshold
+
     def _check_ref_eager_mode_before_print_output_code(self) -> None:
         """
         Check if ref eager mode is enabled before printing output code. If ref eager mode is enabled, raise an error.
diff --git a/test/test_autotuner.py b/test/test_autotuner.py