From 97e49a49b0220086698a317f78d6d1e8d97fef11 Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Thu, 23 Apr 2026 15:27:04 -0700
Subject: [PATCH] Right-size operator registry from selective build metadata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The runtime kernel registry is a fixed-size static array whose default
capacity (MAX_KERNEL_NUM=2000) permanently occupies ~48 KiB of BSS even
when only a handful of kernels are actually registered. When selective
build is active the exact set of kernels needed by the model is already
known at build time, so the registry can be sized to fit.

A new codegen tool (gen_max_kernel_num.py) counts the (op, kernel_key)
tuples in selected_operators.yaml, adds the prim ops registered by
register_prim_ops.cpp, and writes the total into a generated header.
operator_registry.cpp picks the header up via __has_include; a
user-supplied -DMAX_KERNEL_NUM still takes precedence, and builds that
don't use selective build keep the 2000 default.

On examples/selective_build/basic with two selected ops, the registry's
BSS footprint drops from 48000 B (2000 slots) to 840 B (35 slots) — a
~46 KiB reduction — with no additional flag required.

Issue: #18618

Co-authored-by: Claude <noreply@anthropic.com>
---
 CMakeLists.txt                                |  26 ++
 codegen/tools/gen_max_kernel_num.py           | 174 ++++++++++++++
 codegen/tools/test/test_gen_max_kernel_num.py | 226 ++++++++++++++++++
 docs/source/kernel-library-selective-build.md |  35 +++
 .../selective_build/test_selective_build.sh   |  13 +
 runtime/kernel/operator_registry.cpp          |  17 +-
 tools/cmake/Codegen.cmake                     |  63 +++++
 7 files changed, 552 insertions(+), 2 deletions(-)
 create mode 100644 codegen/tools/gen_max_kernel_num.py
 create mode 100644 codegen/tools/test/test_gen_max_kernel_num.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f44b650aa1..4651802d568 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1228,6 +1228,32 @@ if(NOT EXECUTORCH_SELECT_OPS_YAML STREQUAL ""
   )
   list(APPEND _executorch_kernels executorch_selected_kernels)
 
+  # Auto-right-size the kernel registry from the selected-ops YAML unless the
+  # user has pinned MAX_KERNEL_NUM (as a cache entry or a normal variable).
+  if(NOT DEFINED CACHE{MAX_KERNEL_NUM} AND NOT DEFINED MAX_KERNEL_NUM)
+    gen_selected_max_kernel_num(
+      LIB_NAME "executorch_selected_kernels" OPLIST_YAMLS
+      ${gen_selected_ops_output_yaml}
+    )
+    target_include_directories(
+      executorch_core
+      PRIVATE ${executorch_selected_kernels_max_kernel_num_include_dir}
+    )
+    add_dependencies(
+      executorch_core executorch_selected_kernels_max_kernel_num_header
+    )
+    if(TARGET executorch_core_shared)
+      target_include_directories(
+        executorch_core_shared
+        PRIVATE ${executorch_selected_kernels_max_kernel_num_include_dir}
+      )
+      add_dependencies(
+        executorch_core_shared
+        executorch_selected_kernels_max_kernel_num_header
+      )
+    endif()
+  endif()
+
   install(
     TARGETS executorch_selected_kernels
     EXPORT ExecuTorchTargets
diff --git a/codegen/tools/gen_max_kernel_num.py b/codegen/tools/gen_max_kernel_num.py
new file mode 100644
index 00000000000..78e1abe2e1e
--- /dev/null
+++ b/codegen/tools/gen_max_kernel_num.py
@@ -0,0 +1,174 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Compute a right-sized MAX_KERNEL_NUM for the ExecuTorch operator registry from
+one or more selected_operators.yaml files (produced by gen_oplist.py) and emit
+it as a C header.
+
+Total = sum of (op, kernel_key) variants across all input YAMLs
+      + prim ops always registered by kernels/prim_ops/register_prim_ops.cpp.
+
+See runtime/kernel/operator_registry.cpp for how the emitted header is
+consumed and the full precedence order. Users that register kernels outside
+the selective-build YAML should pin the registry explicitly with
+-DMAX_KERNEL_NUM=N.
+"""
+
+import argparse
+import re
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import yaml
+
+
+HEADER_TEMPLATE = """\
+// @generated by executorch/codegen/tools/gen_max_kernel_num.py. Do not edit.
+#pragma once
+#define EXECUTORCH_SELECTED_MAX_KERNEL_NUM {count}
+"""
+
+# When a YAML opts into include_all_operators, we still need to write *some*
+# output file to keep CMake's add_custom_command contract honest, but without
+# defining EXECUTORCH_SELECTED_MAX_KERNEL_NUM so that operator_registry.cpp
+# falls through to its compile-time default.
+OPT_OUT_HEADER = """\
+// @generated by executorch/codegen/tools/gen_max_kernel_num.py. Do not edit.
+#pragma once
+// Selective build opted into all operators; registry uses compile-time default.
+"""
+
+# Locates the `static Kernel prim_ops[] = { ... };` array literal. The count
+# lives in the array itself (`kernel_span` uses `sizeof(prim_ops)/sizeof(Kernel)`
+# at compile time), so we just bracket-match the array body and count Kernel(
+# entries inside it, ignoring the rest of the file.
+PRIM_OPS_ARRAY_RE = re.compile(
+    r"static\s+Kernel\s+prim_ops\s*\[\s*\]\s*=\s*\{(.*?)^\};",
+    re.DOTALL | re.MULTILINE,
+)
+PRIM_OPS_KERNEL_RE = re.compile(r"\bKernel\s*\(")
+
+
+def _count_prim_ops(prim_ops_source: Path) -> int:
+    """Count the Kernel(...) entries in the `prim_ops[]` array of a C++ file."""
+    match = PRIM_OPS_ARRAY_RE.search(prim_ops_source.read_text(encoding="utf-8"))
+    if match is None:
+        raise RuntimeError(
+            f"Failed to locate `static Kernel prim_ops[] = {{ ... }};` in "
+            f"{prim_ops_source}. The array may have been renamed; update "
+            "PRIM_OPS_ARRAY_RE in gen_max_kernel_num.py."
+        )
+    count = len(PRIM_OPS_KERNEL_RE.findall(match.group(1)))
+    if count == 0:
+        raise RuntimeError(
+            f"Found `prim_ops[]` in {prim_ops_source} but it contains zero "
+            "Kernel(...) entries. The array layout may have changed."
+        )
+    return count
+
+
+def _count_yaml_kernels(yaml_path: Path) -> Optional[int]:
+    """Count the kernels one selected_operators.yaml asks to register.
+
+    Returns None when the YAML sets include_all_operators, or any operator
+    sets include_all_overloads; callers then skip the auto-size header.
+    Otherwise each et_kernel_metadata entry contributes one slot per listed
+    variant (at least one), and operators without an entry contribute one.
+    """
+    with open(yaml_path, "r") as f:
+        data = yaml.safe_load(f) or {}
+
+    if data.get("include_all_operators"):
+        return None
+
+    operators: Dict[str, Dict[str, Any]] = data.get("operators") or {}
+    if any(
+        isinstance(info, dict) and info.get("include_all_overloads")
+        for info in operators.values()
+    ):
+        return None
+
+    et_kernel_metadata: Dict[str, List[str]] = data.get("et_kernel_metadata") or {}
+
+    # A non-list or empty variant entry still occupies one registry slot.
+    with_metadata = sum(
+        len(variants) if isinstance(variants, list) and variants else 1
+        for variants in et_kernel_metadata.values()
+    )
+    # Operators listed but missing from et_kernel_metadata still register one
+    # default kernel each.
+    without_metadata = sum(1 for op in operators if op not in et_kernel_metadata)
+
+    return with_metadata + without_metadata
+
+
+def _write_if_different(path: Path, content: str) -> None:
+    """Write content to path, leaving an up-to-date file (and mtime) alone."""
+    if not (path.exists() and path.read_text() == content):
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(content)
+
+
+def gen_max_kernel_num(
+    oplist_yamls: List[Path],
+    prim_ops_source: Path,
+    output_path: Path,
+) -> Optional[int]:
+    """Emit the sizing header; return the slot total, or None on opt-out."""
+    yaml_total = 0
+    for yaml_path in oplist_yamls:
+        yaml_count = _count_yaml_kernels(yaml_path)
+        if yaml_count is None:
+            print(
+                f"gen_max_kernel_num: {yaml_path} opts into all operators; "
+                "emitting opt-out header (registry will use default size).",
+                file=sys.stderr,
+            )
+            _write_if_different(output_path, OPT_OUT_HEADER)
+            return None
+        yaml_total += yaml_count
+
+    total = yaml_total + _count_prim_ops(prim_ops_source)
+    _write_if_different(output_path, HEADER_TEMPLATE.format(count=total))
+    return total
+
+
+def main(argv: List[str]) -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--oplist-yaml",
+        "--oplist_yaml",
+        action="append",  # repeated flags accumulate into one list
+        required=True,
+        help="Path to a selected_operators.yaml. May be repeated.",
+    )
+    parser.add_argument(
+        "--prim-ops-source",
+        "--prim_ops_source",
+        required=True,
+        help="Path to kernels/prim_ops/register_prim_ops.cpp.",
+    )
+    parser.add_argument(
+        "--output-path",
+        "--output_path",
+        required=True,
+        help="Path to the header file to emit.",
+    )
+    args = parser.parse_args(argv)
+
+    count = gen_max_kernel_num(
+        oplist_yamls=[Path(p) for p in args.oplist_yaml],
+        prim_ops_source=Path(args.prim_ops_source),
+        output_path=Path(args.output_path),
+    )
+    if count is not None:  # None: an opt-out header was written instead
+        print(f"gen_max_kernel_num: wrote {args.output_path} (count={count})")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/codegen/tools/test/test_gen_max_kernel_num.py b/codegen/tools/test/test_gen_max_kernel_num.py
new file mode 100644
index 00000000000..1b701ad96d6
--- /dev/null
+++ b/codegen/tools/test/test_gen_max_kernel_num.py
@@ -0,0 +1,226 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import tempfile
+import unittest
+from pathlib import Path
+
+import yaml
+
+from executorch.codegen.tools.gen_max_kernel_num import (
+    _count_prim_ops,
+    _count_yaml_kernels,
+    gen_max_kernel_num,
+)
+
+
+PRIM_OPS_STUB = """\
+// Helper defined above the array; its Kernel( must not be counted.
+void build_helper_kernel() { something::Kernel(unrelated); }
+
+static Kernel prim_ops[] = {
+    Kernel("aten::sym_size.int", sym_size_int),
+    Kernel(
+        "executorch_prim::add.int_int",
+        add_int_int),
+    Kernel("executorch_prim::mul.int_int", mul_int_int),
+};
+
+// Another stray Kernel( below the array; also must not be counted.
+auto misleading = Kernel("not_a_prim_op", nullptr);
+"""
+
+
+def _write_yaml(path: Path, payload: dict) -> None:
+    path.write_text(yaml.safe_dump(payload))
+
+
+class TestGenMaxKernelNum(unittest.TestCase):
+    def setUp(self) -> None:
+        self._tmp = tempfile.TemporaryDirectory()
+        self.tmp = Path(self._tmp.name)
+        self.prim_ops_source = self.tmp / "register_prim_ops.cpp"
+        self.prim_ops_source.write_text(PRIM_OPS_STUB)
+        self.output = self.tmp / "selected_max_kernel_num.h"
+
+    def tearDown(self) -> None:
+        self._tmp.cleanup()
+
+    def test_counts_prim_ops_from_source(self) -> None:
+        self.assertEqual(_count_prim_ops(self.prim_ops_source), 3)
+
+    def test_counts_prim_ops_errors_when_array_missing(self) -> None:
+        empty = self.tmp / "empty.cpp"
+        empty.write_text("// no kernels here\n")
+        with self.assertRaises(RuntimeError):
+            _count_prim_ops(empty)
+
+    def test_counts_prim_ops_errors_when_array_empty(self) -> None:
+        empty_array = self.tmp / "empty_array.cpp"
+        empty_array.write_text("static Kernel prim_ops[] = {\n};\n")
+        with self.assertRaises(RuntimeError):
+            _count_prim_ops(empty_array)
+
+    def test_counts_single_variant_per_op(self) -> None:
+        yaml_path = self.tmp / "selected_operators.yaml"
+        _write_yaml(
+            yaml_path,
+            {
+                "operators": {
+                    "aten::add.out": {"is_root_operator": True},
+                    "aten::mul.out": {"is_root_operator": True},
+                },
+                "et_kernel_metadata": {
+                    "aten::add.out": ["v1/6;0,1|6;0,1|6;0,1|6;0,1"],
+                    "aten::mul.out": ["v1/6;0,1|6;0,1|6;0,1|6;0,1"],
+                },
+            },
+        )
+        self.assertEqual(_count_yaml_kernels(yaml_path), 2)
+
+    def test_counts_multiple_variants_per_op(self) -> None:
+        yaml_path = self.tmp / "selected_operators.yaml"
+        _write_yaml(
+            yaml_path,
+            {
+                "operators": {"aten::add.out": {"is_root_operator": True}},
+                "et_kernel_metadata": {
+                    "aten::add.out": [
+                        "v1/6;0,1|6;0,1|6;0,1|6;0,1",
+                        "v1/3;0,1|3;0,1|3;0,1|3;0,1",
+                    ],
+                },
+            },
+        )
+        self.assertEqual(_count_yaml_kernels(yaml_path), 2)
+
+    def test_counts_ops_without_metadata(self) -> None:
+        yaml_path = self.tmp / "selected_operators.yaml"
+        _write_yaml(
+            yaml_path,
+            {
+                "operators": {
+                    "aten::add.out": {"is_root_operator": True},
+                    "aten::mul.out": {"is_root_operator": True},
+                },
+                "et_kernel_metadata": {},
+            },
+        )
+        self.assertEqual(_count_yaml_kernels(yaml_path), 2)
+
+    def test_include_all_operators_returns_none(self) -> None:
+        yaml_path = self.tmp / "selected_operators.yaml"
+        _write_yaml(
+            yaml_path,
+            {
+                "include_all_operators": True,
+                "operators": {},
+                "et_kernel_metadata": {},
+            },
+        )
+        self.assertIsNone(_count_yaml_kernels(yaml_path))
+
+    def test_include_all_overloads_returns_none(self) -> None:
+        yaml_path = self.tmp / "selected_operators.yaml"
+        _write_yaml(
+            yaml_path,
+            {
+                "operators": {
+                    "aten::add": {"include_all_overloads": True},
+                },
+                "et_kernel_metadata": {},
+            },
+        )
+        self.assertIsNone(_count_yaml_kernels(yaml_path))
+
+    def test_end_to_end_single_yaml(self) -> None:
+        yaml_path = self.tmp / "selected_operators.yaml"
+        _write_yaml(
+            yaml_path,
+            {
+                "operators": {"aten::add.out": {"is_root_operator": True}},
+                "et_kernel_metadata": {
+                    "aten::add.out": ["v1/6;0,1|6;0,1|6;0,1|6;0,1"],
+                },
+            },
+        )
+        total = gen_max_kernel_num(
+            oplist_yamls=[yaml_path],
+            prim_ops_source=self.prim_ops_source,
+            output_path=self.output,
+        )
+        self.assertEqual(total, 1 + 3)
+        self.assertIn(
+            "#define EXECUTORCH_SELECTED_MAX_KERNEL_NUM 4",
+            self.output.read_text(),
+        )
+
+    def test_end_to_end_multiple_yamls(self) -> None:
+        yaml_a = self.tmp / "a.yaml"
+        yaml_b = self.tmp / "b.yaml"
+        _write_yaml(
+            yaml_a,
+            {
+                "operators": {"aten::add.out": {}},
+                "et_kernel_metadata": {"aten::add.out": ["v1/6", "v1/3"]},
+            },
+        )
+        _write_yaml(
+            yaml_b,
+            {
+                "operators": {"aten::mul.out": {}},
+                "et_kernel_metadata": {"aten::mul.out": ["v1/6"]},
+            },
+        )
+        total = gen_max_kernel_num(
+            oplist_yamls=[yaml_a, yaml_b],
+            prim_ops_source=self.prim_ops_source,
+            output_path=self.output,
+        )
+        self.assertEqual(total, 2 + 1 + 3)
+
+    def test_include_all_writes_opt_out_header(self) -> None:
+        yaml_path = self.tmp / "selected_operators.yaml"
+        _write_yaml(
+            yaml_path,
+            {"include_all_operators": True, "operators": {}, "et_kernel_metadata": {}},
+        )
+        total = gen_max_kernel_num(
+            oplist_yamls=[yaml_path],
+            prim_ops_source=self.prim_ops_source,
+            output_path=self.output,
+        )
+        self.assertIsNone(total)
+        self.assertTrue(self.output.exists())
+        contents = self.output.read_text()
+        self.assertNotIn("#define EXECUTORCH_SELECTED_MAX_KERNEL_NUM", contents)
+        self.assertIn("opted into all operators", contents)
+
+    def test_write_if_different_preserves_mtime(self) -> None:
+        yaml_path = self.tmp / "selected_operators.yaml"
+        _write_yaml(
+            yaml_path,
+            {
+                "operators": {"aten::add.out": {}},
+                "et_kernel_metadata": {"aten::add.out": ["v1/6"]},
+            },
+        )
+        gen_max_kernel_num(
+            oplist_yamls=[yaml_path],
+            prim_ops_source=self.prim_ops_source,
+            output_path=self.output,
+        )
+        first_mtime = self.output.stat().st_mtime_ns
+        gen_max_kernel_num(
+            oplist_yamls=[yaml_path],
+            prim_ops_source=self.prim_ops_source,
+            output_path=self.output,
+        )
+        self.assertEqual(self.output.stat().st_mtime_ns, first_mtime)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/docs/source/kernel-library-selective-build.md b/docs/source/kernel-library-selective-build.md
index edec9567b7b..c2e14114caa 100644
--- a/docs/source/kernel-library-selective-build.md
+++ b/docs/source/kernel-library-selective-build.md
@@ -85,6 +85,41 @@ This API lets users pass in a list of operator names. Note that this API can be
 
 This API lets users pass in a pte file of an exported model. When used, the pte file will be parsed to generate a yaml file that enumerates the operators and dtypes used in the model.
 
+### Right-sizing the kernel registry
+
+The runtime's operator registry is a fixed-size static array whose capacity
+is set at compile time via `MAX_KERNEL_NUM` (default 2000). Each slot is a
+three-pointer `Kernel` struct (12 bytes on 32-bit, 24 bytes on 64-bit), so
+the array permanently occupies roughly 24&nbsp;KiB on 32-bit targets and
+48&nbsp;KiB on 64-bit — regardless of how many kernels actually register.
+This overhead is most visible on small embedded targets where KiBs of
+static RAM matter. When selective build is active the exact kernel set is
+known at build time, so the registry can be sized to fit.
+
+Any selective-build invocation (`EXECUTORCH_SELECT_OPS_MODEL`,
+`EXECUTORCH_SELECT_OPS_LIST`, or `EXECUTORCH_SELECT_OPS_YAML`) now
+automatically computes a right-sized `MAX_KERNEL_NUM` and propagates it into
+`operator_registry.cpp` via a generated header. No additional flag is
+required.
+
+Resolution order in `operator_registry.cpp`:
+
+1. A user-supplied `-DMAX_KERNEL_NUM=N` always wins.
+2. Otherwise, the auto-computed value from the generated
+   `selected_max_kernel_num.h` is used.
+3. Otherwise (no selective build), the default 2000 is used.
+
+The count is `sum(kernel variants in et_kernel_metadata)`, plus one per selected
+operator without metadata, plus the prim ops registered by
+`kernels/prim_ops/register_prim_ops.cpp`. If a YAML opts into `include_all_operators`
+or `include_all_overloads`, auto-sizing is skipped and the default capacity applies.
+
+If you register kernels outside the selective-build YAML (for example via
+`EXECUTORCH_LIBRARY` macros in your own code), pin the registry explicitly
+with `-DMAX_KERNEL_NUM=N`. A too-small registry aborts at static init with
+`Error::RegistrationExceedingMaxKernels` and a log of every attempted
+kernel.
+
 ### Dtype Selective Build
 
 Beyond pruning the binary to remove unused operators, the binary size can further reduced by removing unused dtypes. For example, if your model only uses floats for the `add` operator, then including variants of the `add` operators for `doubles` and `ints` is unnecessary. The flag `DTYPE_SELECTIVE_BUILD` can be set to `ON` to support this additional optimization. Currently, dtype selective build is only supported with the model API described above. Once enabled, a header file that specifies only the operators and dtypes used by the model is created and linked against a rebuild of the `portable_kernels` lib. This feature is only supported for the portable kernels library; it's not supported for optimized, quantized or custom kernel libraries.
diff --git a/examples/selective_build/test_selective_build.sh b/examples/selective_build/test_selective_build.sh
index 942df7eae04..c1b5c627c42 100755
--- a/examples/selective_build/test_selective_build.sh
+++ b/examples/selective_build/test_selective_build.sh
@@ -146,6 +146,8 @@ test_cmake_select_ops_in_model() {
     local example_dir=examples/selective_build/basic
     local build_dir=cmake-out/${example_dir}
     rm -rf ${build_dir}
+    # No -DMAX_KERNEL_NUM: selective build auto-right-sizes the registry from
+    # the .pte via gen_selected_max_kernel_num().
     retry cmake -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
             -DEXECUTORCH_SELECT_OPS_MODEL="./${model_export_name}" \
             -DEXECUTORCH_DTYPE_SELECTIVE_BUILD=ON \
@@ -158,6 +160,17 @@ test_cmake_select_ops_in_model() {
     echo "Building ${example_dir}"
     cmake --build ${build_dir} -j9 --config $CMAKE_BUILD_TYPE
 
+    echo "Verifying auto-right-sized MAX_KERNEL_NUM header was generated"
+    local generated_header
+    generated_header=$(find "${build_dir}" -name selected_max_kernel_num.h -print -quit)
+    if [[ -z "${generated_header}" ]]; then
+      echo "ERROR: selected_max_kernel_num.h not generated"
+      exit 1
+    fi
+    grep -q "^#define EXECUTORCH_SELECTED_MAX_KERNEL_NUM [0-9]" "${generated_header}" \
+      || { echo "ERROR: header missing expected define"; exit 1; }
+    echo "Generated: $(tail -n 1 "${generated_header}")"
+
     echo 'Running selective build test'
     ${build_dir}/selective_build_test --model_path="./${model_export_name}"
 
diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp
index 683f641020a..1bfe3dc45ab 100644
--- a/runtime/kernel/operator_registry.cpp
+++ b/runtime/kernel/operator_registry.cpp
@@ -20,14 +20,27 @@ namespace ET_RUNTIME_NAMESPACE {
 namespace {
 
 // Maximum number of operators and their associated kernels that can be
-// registered.
-#ifdef MAX_KERNEL_NUM
+// registered. Resolution order:
+//   1. User-defined -DMAX_KERNEL_NUM wins.
+//   2. Otherwise, if selective build generated selected_max_kernel_num.h and
+//      it defines EXECUTORCH_SELECTED_MAX_KERNEL_NUM, use that. (When a
+//      selective-build YAML opts into all operators the header is emitted
+//      without the define, and we fall through.)
+//   3. Otherwise, fall back to a conservative default of 2000 slots.
+#if defined(MAX_KERNEL_NUM)
 constexpr uint32_t kMaxRegisteredKernels = MAX_KERNEL_NUM;
 #else
+#if __has_include(<executorch/runtime/kernel/selected_max_kernel_num.h>)
+#include <executorch/runtime/kernel/selected_max_kernel_num.h>
+#endif
+#if defined(EXECUTORCH_SELECTED_MAX_KERNEL_NUM)
+constexpr uint32_t kMaxRegisteredKernels = EXECUTORCH_SELECTED_MAX_KERNEL_NUM;
+#else
 constexpr uint32_t kMaxOperators = 250;
 constexpr uint32_t kMaxKernelsPerOp = 8;
 constexpr uint32_t kMaxRegisteredKernels = kMaxOperators * kMaxKernelsPerOp;
 #endif
+#endif
 
 // Data that backs the kernel table. Since Kernel has a custom default
 // constructor (implicitly, because it contains KernelKey, which has a custom
diff --git a/tools/cmake/Codegen.cmake b/tools/cmake/Codegen.cmake
index e838e62c582..4253fa44dc5 100644
--- a/tools/cmake/Codegen.cmake
+++ b/tools/cmake/Codegen.cmake
@@ -85,6 +85,69 @@ function(gen_selected_ops)
       WORKING_DIRECTORY ${EXECUTORCH_ROOT}
     )
   endif()
+
+  # Expose the generated YAML path so callers can feed it into
+  # gen_selected_max_kernel_num().
+  set(gen_selected_ops_output_yaml
+      ${_oplist_yaml}
+      PARENT_SCOPE
+  )
+endfunction()
+
+# Generate selected_max_kernel_num.h from one or more selected_operators.yaml
+# files. operator_registry.cpp picks the header up via __has_include when the
+# user has not explicitly set -DMAX_KERNEL_NUM.
+#
+# Invoked as gen_selected_max_kernel_num( LIB_NAME lib_name OPLIST_YAMLS yaml1
+# [yaml2 ...] )
+#
+# Exposes ${LIB_NAME}_max_kernel_num_include_dir in the parent scope and
+# creates the ${LIB_NAME}_max_kernel_num_header target. Add the include dir
+# and a dependency on that target to whatever compiles operator_registry.cpp.
+function(gen_selected_max_kernel_num)
+  set(one_value_args LIB_NAME)
+  set(multi_value_args OPLIST_YAMLS)
+  cmake_parse_arguments(
+    GEN "" "${one_value_args}" "${multi_value_args}" ${ARGN}
+  )
+
+  if(NOT GEN_LIB_NAME)
+    message(FATAL_ERROR "gen_selected_max_kernel_num: LIB_NAME is required")
+  endif()
+  if(NOT GEN_OPLIST_YAMLS)
+    message(FATAL_ERROR "gen_selected_max_kernel_num: OPLIST_YAMLS is required")
+  endif()
+
+  set(_prim_ops_src ${EXECUTORCH_ROOT}/kernels/prim_ops/register_prim_ops.cpp)
+  set(_gen_script ${EXECUTORCH_ROOT}/codegen/tools/gen_max_kernel_num.py)
+  set(_include_root ${CMAKE_CURRENT_BINARY_DIR}/${GEN_LIB_NAME})
+  set(_header
+      ${_include_root}/executorch/runtime/kernel/selected_max_kernel_num.h
+  )
+
+  # One --oplist-yaml=<path> argument per input YAML.
+  list(TRANSFORM GEN_OPLIST_YAMLS PREPEND "--oplist-yaml="
+       OUTPUT_VARIABLE _yaml_args
+  )
+  set(_gen_command
+      "${PYTHON_EXECUTABLE}" -m codegen.tools.gen_max_kernel_num ${_yaml_args}
+      --prim-ops-source=${_prim_ops_src} --output-path=${_header}
+  )
+
+  add_custom_command(
+    COMMENT "Computing right-sized MAX_KERNEL_NUM for ${GEN_LIB_NAME}"
+    OUTPUT ${_header}
+    COMMAND ${_gen_command}
+    DEPENDS ${GEN_OPLIST_YAMLS} ${_prim_ops_src} ${_gen_script}
+    WORKING_DIRECTORY ${EXECUTORCH_ROOT}
+    VERBATIM
+  )
+  add_custom_target(${GEN_LIB_NAME}_max_kernel_num_header DEPENDS ${_header})
+
+  set(${GEN_LIB_NAME}_max_kernel_num_include_dir
+      ${_include_root}
+      PARENT_SCOPE
+  )
 endfunction()
 
 # Codegen for registering kernels. Kernels are defined in functions_yaml and