From 48b49abe7c52246b8eb88b23b8cea017da0f7f85 Mon Sep 17 00:00:00 2001
From: fangyangci <133664123+fangyangci@users.noreply.github.com>
Date: Mon, 22 Jun 2026 19:05:39 -0700
Subject: [PATCH 1/4] fix analyze e2e test (#926)

---
 tests/e2e/test_analyze_e2e.py | 75 +++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 30 deletions(-)
diff --git a/tests/e2e/test_analyze_e2e.py b/tests/e2e/test_analyze_e2e.py
index 9f436b27e..babcf38fb 100644
--- a/tests/e2e/test_analyze_e2e.py
+++ b/tests/e2e/test_analyze_e2e.py
@@ -44,6 +44,7 @@
 import pytest
 from click.testing import CliRunner
 
+from tests.e2e.require_ep import require_ep
 from winml.modelkit.commands.analyze import analyze
 from winml.modelkit.utils.constants import EP_ALIASES as _EP_ALIASES
 from winml.modelkit.utils.constants import SUPPORTED_EPS
@@ -66,30 +67,44 @@ def _invoke(args: list[str]):
     return CliRunner().invoke(analyze, args, obj={}, catch_exceptions=False)
 
 
+def _build_rule_parquet_path(rules_dir: Path, ep: str, device: str, op: str) -> Path:
+    """Return the ``<EP>_<DEVICE>/<file>.parquet`` rule path, creating the provider dir."""
+    provider_dir = rules_dir / f"{ep}_{device.upper()}"
+    provider_dir.mkdir(parents=True, exist_ok=True)
+    return provider_dir / f"{op}_{ep}_{device.upper()}_ai.onnx_opset13.parquet"
+
+
+def _write_rule_with_result(
+    rules_dir: Path,
+    ep: str,
+    device: str,
+    compile_run_success: tuple[bool, bool],
+    op: str = "MatMul",
+) -> Path:
+    """Write a parquet rule with the given compile/run tuple."""
+    parquet = _build_rule_parquet_path(rules_dir, ep, device, op)
+    pd.DataFrame([{"compile_run_success": compile_run_success}]).to_parquet(parquet, index=False)
+    return parquet
+
+
 def _write_supported_rule(rules_dir: Path, ep: str, device: str, op: str = "MatMul") -> Path:
     """Write a minimally-valid "always supported" parquet rule.
 
     The rule has no condition columns — only the ``compile_run_success``
     tuple — so it unconditionally matches every node of the named op.
     """
-    parquet = rules_dir / f"{op}_{ep}_{device}_ai.onnx_opset13.parquet"
-    pd.DataFrame([{"compile_run_success": (True, True)}]).to_parquet(parquet, index=False)
-    return parquet
+    return _write_rule_with_result(rules_dir, ep, device, (True, True), op)
 
 
 def _write_unsupported_rule(rules_dir: Path, ep: str, device: str, op: str = "MatMul") -> Path:
     """Write a parquet rule that classifies the op as unsupported (compile fails)."""
-    parquet = rules_dir / f"{op}_{ep}_{device}_ai.onnx_opset13.parquet"
-    pd.DataFrame([{"compile_run_success": (False, False)}]).to_parquet(parquet, index=False)
-    return parquet
+    return _write_rule_with_result(rules_dir, ep, device, (False, False), op)
 
 
 def _write_partial_rule(rules_dir: Path, ep: str, device: str, op: str = "MatMul") -> Path:
     """Write a parquet rule that classifies the op as partially supported
     (compile succeeds, run fails). No condition columns → unconditional match."""
-    parquet = rules_dir / f"{op}_{ep}_{device}_ai.onnx_opset13.parquet"
-    pd.DataFrame([{"compile_run_success": (True, False)}]).to_parquet(parquet, index=False)
-    return parquet
+    return _write_rule_with_result(rules_dir, ep, device, (True, False), op)
 
 
 @pytest.fixture
@@ -325,42 +340,42 @@ def test_default_device_auto_resolves_single_best_device_for_pinned_ep(
         self,
         onnx_model_path: Path,
         rules_dir: Path,
-        monkeypatch: pytest.MonkeyPatch,
     ) -> None:
         """Omitting ``--device`` resolves a single best device for the pinned EP.
 
-        ``auto`` now picks one target via the shared sysinfo helpers (like
-        build/run): for ``qnn`` locally available on NPU and GPU, the
-        highest-priority device (NPU) is chosen — a single ``(qnn, NPU)`` run.
+        ``auto`` picks one target via the shared sysinfo helpers (like
+        build/run). On a QNN-capable host the highest-priority device is NPU,
+        so ``--ep qnn`` with no ``--device`` resolves to a single ``(qnn, NPU)``
+        run that is fully supported.
 
-        The test is hardware-agnostic: local availability is controlled via the
-        ORT device->EP map monkeypatch rather than real machine capabilities.
+        Real end-to-end: gated on actual QNN availability via ``require_ep``
+        rather than monkeypatching local capabilities. The auto-resolution
+        logic itself is covered hardware-agnostically by the unit-level
+        selection-matrix test.
         """
-        monkeypatch.setattr(
-            "winml.modelkit.sysinfo.device._get_device_ep_map_from_ort",
-            lambda: {
-                "npu": ("QNNExecutionProvider",),
-                "gpu": ("QNNExecutionProvider", "DmlExecutionProvider"),
-                "cpu": ("CPUExecutionProvider",),
-            },
-        )
+        require_ep("qnn")
         _write_supported_rule(rules_dir, "QNNExecutionProvider", "NPU")
         result = _invoke(["-m", str(onnx_model_path), "--ep", "qnn", "--quiet"])
         assert result.exit_code == 0
 
     def test_default_auto_selects_single_ep_when_ep_omitted(
-        self, onnx_model_path: Path, rules_dir: Path
+        self,
+        onnx_model_path: Path,
+        rules_dir: Path,
     ) -> None:
         """Omitting ``--ep`` resolves a single best EP from local availability.
 
-        With a synthetic rule present the run must complete cleanly; the auto
-        axis resolves from the real ORT device map (CPU EP is always available
-        as a fallback), so only documented exit codes are asserted."""
+        On a QNN-capable host the highest-priority device (NPU) and its
+        highest-priority EP (QNN) win, so bare ``auto`` resolves to ``(qnn,
+        NPU)`` and should be fully supported.
+
+        Real end-to-end: gated on actual QNN availability via ``require_ep``
+        rather than monkeypatching local capabilities.
+        """
+        require_ep("qnn")
         _write_supported_rule(rules_dir, "QNNExecutionProvider", "NPU")
         result = _invoke(["-m", str(onnx_model_path), "--quiet"])
-        # Aggregate result depends on whether the resolved EP is fully
-        # supported; only assert documented exit codes.
-        assert result.exit_code in {0, 1, 2}
+        assert result.exit_code == 0
 
 
 # ===========================================================================

From 6df7dbac9e19a781719e86cd0e663cf0a0b88bac Mon Sep 17 00:00:00 2001
From: fangyangci <133664123+fangyangci@users.noreply.github.com>
Date: Mon, 22 Jun 2026 19:45:11 -0700
Subject: [PATCH 2/4] fix analyze coverage bugs (#922)

---
 CHANGELOG.md                                  |  2 +-
 .../modelkit/analyze/core/runtime_checker.py  |  5 +-
 .../analyze/core/runtime_checker_query.py     | 34 +++----
 .../rules/runtime_check_rules/README.md       | 17 ++--
 .../modelkit/analyze/utils/rule_loader.py     | 88 +++++++++---------
 src/winml/modelkit/commands/analyze.py        | 89 +++++++++++--------
 .../op_input_gen/pad_input_generator.py       |  5 ++
 tests/unit/analyze/core/test_qdq.py           |  4 +-
 .../test_runtime_checker_query_parquet.py     | 31 +++++++
 tests/unit/analyze/models/test_rule_loader.py | 65 ++++++++------
 .../unit/analyze/test_static_analyzer_cli.py  | 15 ++++
 11 files changed, 223 insertions(+), 132 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e6ab75d9..9d2f5bfb0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -112,7 +112,7 @@ Expand-Archive -Path .\rules-v0.0.3.zip -DestinationPath src\winml\modelkit\anal
 
 `gh release download` skips pre-releases unless you pass `--tag`, so the explicit `v0.0.3` is required.
 
-If you set `MODELKIT_RULES_DIR` anywhere (shell profile, CI pipeline, user env), rename it to `WINMLCLI_RULES_DIR`. Same `os.pathsep`-separated multi-directory semantics; relative paths still resolve from `src/winml/modelkit/analyze/utils/`.
+If you set `MODELKIT_RULES_DIR` anywhere (shell profile, CI pipeline, user env), rename it to `WINMLCLI_RULES_DIR`. The value is treated as a single rules directory (it is not split on `os.pathsep`) and, when set, replaces the embedded default directory rather than augmenting it; relative paths still resolve from `src/winml/modelkit/analyze/utils/`.
 
 Related PRs: #411 (Parquet migration), #600 (rules zip in release), #627 (versioned filename), #587 (env var rename as part of ModelKit → WinML CLI Wave 1).
 
diff --git a/src/winml/modelkit/analyze/core/runtime_checker.py b/src/winml/modelkit/analyze/core/runtime_checker.py
index 62c5d3d34..4a766c56b 100644
--- a/src/winml/modelkit/analyze/core/runtime_checker.py
+++ b/src/winml/modelkit/analyze/core/runtime_checker.py
@@ -221,12 +221,11 @@ def op_support(
         run_for_node_total_ms = 0
         callback_total_ms = 0
 
-        # Get all nodes from model
-        model_proto = self._model.get_model()
         # Get cached RuntimeCheckerQuery
         query = self._get_query()
+        # Use the same graph snapshot as RuntimeCheckerQuery (post shape inference).
+        nodes = query.model_proto.graph.node
         # Use tqdm for progress unless caller provides a callback
-        nodes = model_proto.graph.node
         iterator = nodes if on_node_result else tqdm.tqdm(nodes)
         for node in iterator:
             node_start = time.perf_counter()
diff --git a/src/winml/modelkit/analyze/core/runtime_checker_query.py b/src/winml/modelkit/analyze/core/runtime_checker_query.py
index 86e98e25a..e58928b34 100644
--- a/src/winml/modelkit/analyze/core/runtime_checker_query.py
+++ b/src/winml/modelkit/analyze/core/runtime_checker_query.py
@@ -62,7 +62,9 @@
     shape_and_dtype_from_valueinfo,
 )
 from ..utils.node_key_utils import build_node_key_by_node_id, resolve_stable_node_key
-from ..utils.rule_loader import resolve_rule_parquet_path
+from ..utils.rule_loader import (
+    resolve_rule_parquet_path,
+)
 from ..utils.timing_utils import make_timing_logger
 from .node_checkers.base import NodeChecker
 from .node_checkers.registry import NodeCheckerRegistry
@@ -1935,13 +1937,13 @@ def _load_parquet_rule_table(
         op_since_version: int,
         is_qdq: bool,
         for_debug: bool = False,
-    ) -> tuple[pd.DataFrame | None, Path | None, _ParquetConditionTree | None]:
+    ) -> tuple[Path, pd.DataFrame | None, _ParquetConditionTree | None]:
         """Load per-op parquet rule table with cache.
 
         Returns:
-            tuple[pd.DataFrame | None, Path | None, _ParquetConditionTree | None]:
-                Loaded dataframe when available, otherwise None,
-                the resolved parquet path used for lookup when found,
+            tuple[Path, pd.DataFrame | None, _ParquetConditionTree | None]:
+                The resolved or expected parquet path for lookup,
+                loaded dataframe when available, otherwise None,
                 and optional pre-built condition tree.
         """
         parquet_name = (
@@ -1950,26 +1952,30 @@ def _load_parquet_rule_table(
         )
         parquet_path = resolve_rule_parquet_path(parquet_name, for_debug=for_debug)
 
+        # This per-instance cache assumes a stable rules location for the query's
+        # lifetime: the rule-dir env vars must not change between calls. The path
+        # is recomputed each call (so reporting reflects the current location), but
+        # a cached None is reused without re-probing. NOTE(review): key omits for_debug.
         cache_key = (op_name, op_domain.value, op_since_version, is_qdq)
         if cache_key in self._parquet_rule_table_cache:
-            if parquet_path is not None:
+            if self._parquet_rule_table_cache[cache_key] is not None:
                 _log_parquet_cache_hit(parquet_path, scope="instance")
             return (
-                self._parquet_rule_table_cache[cache_key],
                 parquet_path,
+                self._parquet_rule_table_cache[cache_key],
                 self._parquet_condition_tree_cache.get(cache_key),
             )
 
-        if parquet_path is None:
+        if not parquet_path.exists():
             self._parquet_rule_table_cache[cache_key] = None
             self._parquet_condition_tree_cache[cache_key] = None
-            return None, None, None
+            return parquet_path, None, None
 
         table_df = _get_or_load_parquet_table_global(parquet_path)
         condition_tree = _build_condition_tree(table_df)
         self._parquet_rule_table_cache[cache_key] = table_df
         self._parquet_condition_tree_cache[cache_key] = condition_tree
-        return table_df, parquet_path, condition_tree
+        return parquet_path, table_df, condition_tree
 
     def _run_for_node_with_parquet_rules(
         self,
@@ -2023,7 +2029,7 @@ def _finish(result: PatternRuntime, outcome: str, **extra: Any) -> PatternRuntim
         since_version_ms = _elapsed_ms(since_version_start)
 
         load_table_start = time.perf_counter()
-        table_df, parquet_path, condition_tree = self._load_parquet_rule_table(
+        parquet_path, table_df, condition_tree = self._load_parquet_rule_table(
             node.op_type,
             op_domain,
             op_since_version,
@@ -2031,10 +2037,8 @@ def _finish(result: PatternRuntime, outcome: str, **extra: Any) -> PatternRuntim
             for_debug=for_debug,
         )
         load_table_ms = _elapsed_ms(load_table_start)
-        parquet_file = parquet_path.name if parquet_path is not None else None
-        parquet_path_norm = (
-            _normalize_table_path(parquet_path) if parquet_path is not None else None
-        )
+        parquet_file = parquet_path.name
+        parquet_path_norm = _normalize_table_path(parquet_path)
 
         if table_df is None:
             if run_unknown_op:
diff --git a/src/winml/modelkit/analyze/rules/runtime_check_rules/README.md b/src/winml/modelkit/analyze/rules/runtime_check_rules/README.md
index 698a60a15..5f90fdbf9 100644
--- a/src/winml/modelkit/analyze/rules/runtime_check_rules/README.md
+++ b/src/winml/modelkit/analyze/rules/runtime_check_rules/README.md
@@ -56,7 +56,7 @@ Copy all runtime rule parquet files from:
 
 ### Option 4: Use external rules directories via environment variable
 
-Set `WINMLCLI_RULES_DIR` to one or more directories containing parquet rule artifacts.
+Set `WINMLCLI_RULES_DIR` to a single directory containing parquet rule artifacts.
 
 Important: relative paths are resolved from `src/winml/modelkit/analyze/utils/` (the
 directory of `rule_loader.py`), not from the current terminal working directory.
@@ -64,17 +64,18 @@ directory of `rule_loader.py`), not from the current terminal working directory.
 - Windows (PowerShell, user-level absolute path): `[Environment]::SetEnvironmentVariable("WINMLCLI_RULES_DIR", "C:\*path*\rules", "User")`
 - Windows (PowerShell, user-level repo-relative path): `[Environment]::SetEnvironmentVariable("WINMLCLI_RULES_DIR", "..\..\..\..\..\..\ModelKitArtifacts\rules", "User")`
 
-Multiple directories are supported using `os.pathsep` (`;` on Windows, `:` on Unix-like systems).
+Only one directory is supported. The value is not split on `os.pathsep`; it is treated
+as a single literal directory path.
 
 ## Rule lookup order
 
-The analyzer searches directories in this order:
+`WINMLCLI_RULES_DIR` overrides — it does not augment — the embedded default:
 
-1. Directories listed in `WINMLCLI_RULES_DIR` (left to right)
-2. Embedded default directory: `src/winml/modelkit/analyze/rules/runtime_check_rules/`
-
-`WINMLCLI_RULES_DIR` takes precedence over the embedded default when the same parquet file
-exists in multiple locations.
+- If `WINMLCLI_RULES_DIR` is set, only that single directory is searched. The embedded
+  default directory is **not** consulted, so the configured directory must contain every
+  parquet rule you need.
+- If `WINMLCLI_RULES_DIR` is unset or empty, only the embedded default directory is searched:
+  `src/winml/modelkit/analyze/rules/runtime_check_rules/`.
 
 ## What happens if parquet rules are missing
 
diff --git a/src/winml/modelkit/analyze/utils/rule_loader.py b/src/winml/modelkit/analyze/utils/rule_loader.py
index a0f08ca0c..84f5dd94e 100644
--- a/src/winml/modelkit/analyze/utils/rule_loader.py
+++ b/src/winml/modelkit/analyze/utils/rule_loader.py
@@ -17,12 +17,12 @@
 
 logger = logging.getLogger(__name__)
 
-#: Environment variable for additional runtime check rules directories.
-#: Use ``os.pathsep`` (`;` on Windows, `:` on Unix) to separate multiple paths.
+#: Environment variable for the runtime check rules directory.
+#: Holds a single directory path; it is not split on ``os.pathsep``.
 WINMLCLI_RULES_DIR_ENV = "WINMLCLI_RULES_DIR"
 
-#: Environment variable for additional runtime debug rule directories.
-#: Use ``os.pathsep`` (`;` on Windows, `:` on Unix) to separate multiple paths.
+#: Environment variable for the runtime debug rule directory.
+#: Holds a single directory path; it is not split on ``os.pathsep``.
 WINMLCLI_RULES_DIR_FOR_DEBUG_ENV = "WINMLCLI_RULES_DIR_FOR_DEBUG"
 
 # Directory containing this module file. Relative env-var entries are resolved from here.
@@ -46,54 +46,59 @@ def _resolve_env_rules_dir_entry(entry: str) -> Path:
     return (_RULE_LOADER_DIR / entry_path).resolve()
 
 
-def _get_env_rules_dirs(env_name: str) -> list[Path]:
-    """Parse ``os.pathsep``-separated env var values into absolute paths."""
-    dirs: list[Path] = []
+def _get_env_rules_dir(env_name: str) -> Path | None:
+    """Resolve the single directory configured in ``env_name``.
+
+    The value is treated as one directory path and is intentionally not split
+    on ``os.pathsep`` -- only a single rules directory is supported. Returns
+    ``None`` when the env var is unset or blank.
+    """
     env_val = os.environ.get(env_name, "").strip()
-    if env_val:
-        for entry in env_val.split(os.pathsep):
-            entry = entry.strip()
-            if entry:
-                dirs.append(_resolve_env_rules_dir_entry(entry))
-    return dirs
+    if not env_val:
+        return None
+    return _resolve_env_rules_dir_entry(env_val)
 
 
 def get_runtime_rules_search_dirs() -> list[Path]:
-    """Return ordered list of directories to search for runtime rule artifacts.
+    """Return the directory to search for runtime rule artifacts.
 
-    The search order is:
-        1. Any extra directories listed in the :data:`WINMLCLI_RULES_DIR` env var
-            (separated by ``os.pathsep``). Absolute paths are used directly;
-            relative paths are resolved relative to this module file directory.
-      2. Default embedded directory (``src/winml/modelkit/analyze/rules/runtime_check_rules/``)
+    Selection behavior:
+        1. If :data:`WINMLCLI_RULES_DIR` is set, use only that directory.
+            Absolute paths are used directly; a relative path is resolved
+            relative to this module file directory.
+        2. If :data:`WINMLCLI_RULES_DIR` is unset/empty, use the embedded default
+            directory (``src/winml/modelkit/analyze/rules/runtime_check_rules/``).
 
     Returns:
-        List of directory Paths (may include non-existent ones; callers filter).
+        Single-element list with the selected directory (the embedded default
+        when the env var is unset). The directory may not exist; callers filter.
     """
-    dirs = _get_env_rules_dirs(WINMLCLI_RULES_DIR_ENV)
-    dirs.append(_DEFAULT_RUNTIME_RULES_DIR)
-    return dirs
+    env_dir = _get_env_rules_dir(WINMLCLI_RULES_DIR_ENV)
+    if env_dir is not None:
+        return [env_dir]
+    return [_DEFAULT_RUNTIME_RULES_DIR]
 
 
 def get_runtime_rules_debug_search_dirs() -> list[Path]:
-    """Return ordered debug-rule directories from env var only.
+    """Return the debug-rule directory from the env var only.
 
     Unlike :func:`get_runtime_rules_search_dirs`, this intentionally has no
-    embedded default fallback directory.
+    embedded default fallback: an empty list is returned when
+    :data:`WINMLCLI_RULES_DIR_FOR_DEBUG` is unset.
     """
-    return _get_env_rules_dirs(WINMLCLI_RULES_DIR_FOR_DEBUG_ENV)
+    env_dir = _get_env_rules_dir(WINMLCLI_RULES_DIR_FOR_DEBUG_ENV)
+    return [env_dir] if env_dir is not None else []
 
 
-def resolve_rule_parquet_path(parquet_filename: str, for_debug: bool = False) -> Path | None:
-    """Resolve a parquet runtime-rule artifact from ``<EP>_<DEVICE>/`` subdirs.
+def resolve_rule_parquet_path(parquet_filename: str, for_debug: bool = False) -> Path:
+    """Build the expected parquet runtime-rule path (normally under ``<EP>_<DEVICE>/``).
 
     Args:
         parquet_filename: Bare file name, e.g.
             ``Split_QNNExecutionProvider_NPU_ai.onnx_opset13.parquet``
 
     Returns:
-        Resolved Path to the parquet file if found in provider subdirectories;
-        otherwise ``None``.
+        Candidate Path under the selected rules directory. Existence is not checked here.
     """
 
     def _infer_ep_device_subdir(filename: str) -> str | None:
@@ -108,21 +113,22 @@ def _infer_ep_device_subdir(filename: str) -> str | None:
         return f"{match.group('ep')}_{match.group('device')}"
 
     ep_device_subdir = _infer_ep_device_subdir(parquet_filename)
-    if ep_device_subdir is None:
-        return None
+    relative_path = (
+        Path(ep_device_subdir) / parquet_filename
+        if ep_device_subdir is not None
+        else Path(parquet_filename)
+    )
 
     if for_debug:
-        for debug_dir in get_runtime_rules_debug_search_dirs():
-            candidate_in_subdir = debug_dir / ep_device_subdir / parquet_filename
-            if candidate_in_subdir.exists():
-                return candidate_in_subdir
+        debug_dirs = get_runtime_rules_debug_search_dirs()
+        if debug_dirs:
+            return debug_dirs[0] / relative_path
 
-    for search_dir in get_runtime_rules_search_dirs():
-        candidate_in_subdir = search_dir / ep_device_subdir / parquet_filename
-        if candidate_in_subdir.exists():
-            return candidate_in_subdir
+    search_dirs = get_runtime_rules_search_dirs()
+    if search_dirs:
+        return search_dirs[0] / relative_path
 
-    return None
+    return relative_path
 
 
 class RuleLoader:
diff --git a/src/winml/modelkit/commands/analyze.py b/src/winml/modelkit/commands/analyze.py
index a8dc5f289..07c618f6e 100644
--- a/src/winml/modelkit/commands/analyze.py
+++ b/src/winml/modelkit/commands/analyze.py
@@ -928,45 +928,60 @@ def analyze(
             devices = []
         devices = sorted(d.upper() for d in devices)
 
-        eps: list[EPName | None]
-        if ep == "all":
-            eps = list(SUPPORTED_EPS)
-        elif ep == "auto":
-            # Single highest-priority EP available on the target device. With
-            # device == "all" there is no single device context, so fall back to
-            # the best available device purely for EP selection.
-            if device == "all":
-                try:
-                    ref_device, _ = resolve_device(device="auto")
-                except (ValueError, RuntimeError) as e:
-                    logger.error("Could not auto-select an execution provider: %s", e)
+        execution_pairs: list[tuple[EPName, str]]
+        if ep == "auto" and device == "all":
+            # auto + all: resolve the best available EP per device rather than
+            # picking a single EP from one ref device and fanning it across
+            # unrelated devices. resolve_eps() already returns only EPs that are
+            # valid and locally available for the given device, so the resulting
+            # pairs need no further EP_SUPPORTED_DEVICES filtering.
+            execution_pairs = _sort_ep_device_pairs(
+                [
+                    (device_eps[0], target_device)
+                    for target_device in devices
+                    if (device_eps := resolve_eps(target_device))
+                ]
+            )
+        else:
+            eps: list[EPName | None]
+            if ep == "all":
+                eps = list(SUPPORTED_EPS)
+            elif ep == "auto":
+                # Single highest-priority EP available on the target device.
+                # device == "all" is handled above, so a concrete device context
+                # exists here -- but guard against an empty device list (e.g. a
+                # programmatic ``device=None`` call) so we exit cleanly instead
+                # of raising an unguarded IndexError on ``devices[0]``.
+                ref_device = devices[0] if devices else None
+                if not ref_device:
+                    logger.error("No device context available for EP auto-resolution.")
+                    sys.exit(2)
+                compatible_eps = resolve_eps(ref_device)
+                if not compatible_eps:
+                    logger.error(
+                        "No execution provider is available for device '%s'.", ref_device
+                    )
                     sys.exit(2)
+                eps = [compatible_eps[0]]
             else:
-                ref_device = devices[0]
-            compatible_eps = resolve_eps(ref_device)
-            if not compatible_eps:
-                logger.error("No execution provider is available for device '%s'.", ref_device)
-                sys.exit(2)
-            eps = [compatible_eps[0]]
-        else:
-            # ep is a specific EP or alias
-            eps = [normalize_ep_name(ep)]
-
-        # Build with a for-loop rather than a single nested comprehension so
-        # the `candidate_ep is not None and ... in EP_SUPPORTED_DEVICES`
-        # narrowing carries through to the appended tuple's type (EPName,
-        # not str). The inner generator stays a comprehension to satisfy
-        # ruff PERF401.
-        execution_pairs: list[tuple[EPName, str]] = []
-        for candidate_ep in eps:
-            if candidate_ep is None or candidate_ep not in EP_SUPPORTED_DEVICES:
-                continue
-            execution_pairs.extend(
-                (candidate_ep, candidate_device)
-                for candidate_device in devices
-                if candidate_device.lower() in EP_SUPPORTED_DEVICES[candidate_ep]
-            )
-        execution_pairs = _sort_ep_device_pairs(execution_pairs)
+                # ep is a specific EP or alias
+                eps = [normalize_ep_name(ep)]
+
+            # Build with a for-loop rather than a single nested comprehension so
+            # the `candidate_ep is not None and ... in EP_SUPPORTED_DEVICES`
+            # narrowing carries through to the appended tuple's type (EPName,
+            # not str). The inner generator stays a comprehension to satisfy
+            # ruff PERF401.
+            execution_pairs = []
+            for candidate_ep in eps:
+                if candidate_ep is None or candidate_ep not in EP_SUPPORTED_DEVICES:
+                    continue
+                execution_pairs.extend(
+                    (candidate_ep, candidate_device)
+                    for candidate_device in devices
+                    if candidate_device.lower() in EP_SUPPORTED_DEVICES[candidate_ep]
+                )
+            execution_pairs = _sort_ep_device_pairs(execution_pairs)
 
         # Local pairs are still needed to gate --run-unknown-op probing
         # (_resolve_run_unknown_op). Single-target `auto` selection is already
diff --git a/src/winml/modelkit/pattern/op_input_gen/pad_input_generator.py b/src/winml/modelkit/pattern/op_input_gen/pad_input_generator.py
index 86bd49ffc..37bca04cd 100644
--- a/src/winml/modelkit/pattern/op_input_gen/pad_input_generator.py
+++ b/src/winml/modelkit/pattern/op_input_gen/pad_input_generator.py
@@ -86,6 +86,11 @@ def get_input_and_infinite_attribute_combinations(
                 "pads": InputValueConstraint(np.array([0, 0, 1, 1, 0, 0, 1, 1], dtype=np.int64)),
                 "constant_value": InputValueConstraint(np.array(0.0, dtype=np.float32)),
             },
+            {
+                "data": InputShapeConstraint((2, 3, 4, 5)),
+                "pads": InputValueConstraint(np.array([0, 1, 2, 0, 1, 0, 0, 2], dtype=np.int64)),
+                "constant_value": InputValueConstraint(np.array(0.0, dtype=np.float32)),
+            },
             # ===== 5D Input =====
             {
                 "data": InputShapeConstraint((2, 3, 4, 4, 5)),
diff --git a/tests/unit/analyze/core/test_qdq.py b/tests/unit/analyze/core/test_qdq.py
index 4f3a5a76e..cfda4b4fa 100644
--- a/tests/unit/analyze/core/test_qdq.py
+++ b/tests/unit/analyze/core/test_qdq.py
@@ -1178,8 +1178,8 @@ class TestIterQDQCombinations:
             ),  # shape 3 * finite attributes 2 * 2 * 2 * optional combinations 2 * 2 * 2 * 4
             (
                 "Pad",
-                1152,
-            ),  # shape 9 * mode 4 * QDQ 4 * is_constant pads 2 * constant_value present/absent 2
+                1280,
+            ),  # shape 10 * mode 4 * QDQ 4 * is_constant pads 2 * constant_value present/absent 2
             # * Tind 2 (axes not used)
             # All Reduce* use this and it is enough
             (
diff --git a/tests/unit/analyze/core/test_runtime_checker_query_parquet.py b/tests/unit/analyze/core/test_runtime_checker_query_parquet.py
index 6fd4746d1..f8331ac17 100644
--- a/tests/unit/analyze/core/test_runtime_checker_query_parquet.py
+++ b/tests/unit/analyze/core/test_runtime_checker_query_parquet.py
@@ -167,6 +167,37 @@ def test_parquet_lookup_omits_debug_details_without_for_debug(
         assert result_parquet.result.run is False
         assert result_parquet.result.debug_details is None
 
+    def test_rules_not_found_reports_expected_table_path_and_file(
+        self,
+        tmp_path: Path,
+        monkeypatch: pytest.MonkeyPatch,
+        patched_query_conditions,
+    ):
+        """Missing parquet should still report the expected lookup path and file name."""
+        del patched_query_conditions
+
+        monkeypatch.setenv("WINMLCLI_RULES_DIR", str(tmp_path))
+
+        model = _build_add_model()
+        node = model.graph.node[0]
+
+        query_parquet = RuntimeCheckerQuery(model, "QNNExecutionProvider", "NPU")
+        query_parquet.node_checkers = []
+        result = query_parquet.run_for_node(node, for_debug=True, run_unknown_op=False)
+
+        assert result.result.no_data is True
+        assert result.result.reason == "rules_not_found"
+
+        debug_details = result.result.debug_details
+        assert isinstance(debug_details, dict)
+
+        expected_file = "Add_QNNExecutionProvider_NPU_ai.onnx_opset13.parquet"
+        expected_suffix = f"QNNExecutionProvider_NPU/{expected_file}"
+
+        assert debug_details.get("table_file") == expected_file
+        table_path = str(debug_details.get("table_path", "")).replace("\\", "/")
+        assert table_path.endswith(expected_suffix)
+
     def test_parquet_lookup_prefers_debug_dir_when_for_debug(
         self,
         tmp_path: Path,
diff --git a/tests/unit/analyze/models/test_rule_loader.py b/tests/unit/analyze/models/test_rule_loader.py
index 263da39e5..a81d73c23 100644
--- a/tests/unit/analyze/models/test_rule_loader.py
+++ b/tests/unit/analyze/models/test_rule_loader.py
@@ -466,20 +466,30 @@ class TestRuntimeRulesSearchDirs:
     """Test get_runtime_rules_search_dirs behavior."""
 
     def test_default_search_dir_included(self, monkeypatch):
-        """Default embedded dir is always in the search list."""
+        """Default embedded dir is used when env var is unset."""
         monkeypatch.delenv("WINMLCLI_RULES_DIR", raising=False)
         dirs = get_runtime_rules_search_dirs()
-        assert len(dirs) >= 1
-        assert dirs[0].name == "runtime_check_rules"
+        assert len(dirs) == 1
+        assert dirs[0] == _DEFAULT_RUNTIME_RULES_DIR
 
-    def test_env_var_adds_dirs(self, monkeypatch):
-        """WINMLCLI_RULES_DIR adds extra search directories."""
-        monkeypatch.setenv("WINMLCLI_RULES_DIR", f"/extra/path1{os.pathsep}/extra/path2")
+    def test_env_var_overrides_default_with_single_dir(self, monkeypatch):
+        """WINMLCLI_RULES_DIR overrides default and uses only that one directory."""
+        monkeypatch.setenv("WINMLCLI_RULES_DIR", "/extra/path1")
         dirs = get_runtime_rules_search_dirs()
-        assert len(dirs) == 3
+        assert len(dirs) == 1
         assert dirs[0] == Path("/extra/path1").resolve()
-        assert dirs[1] == Path("/extra/path2").resolve()
-        assert dirs[2].name == "runtime_check_rules"
+
+    def test_env_var_not_split_on_pathsep(self, monkeypatch):
+        """Only one directory is supported: the value is not split on os.pathsep.
+
+        A value containing os.pathsep is treated as a single literal directory
+        path rather than multiple search dirs, so the embedded default is never
+        silently consulted as a fallback.
+        """
+        monkeypatch.setenv("WINMLCLI_RULES_DIR", f"/extra/path1{os.pathsep}/extra/path2")
+        dirs = get_runtime_rules_search_dirs()
+        assert len(dirs) == 1
+        assert _DEFAULT_RUNTIME_RULES_DIR not in dirs
 
     def test_env_var_relative_path_resolved_from_module_dir(self, monkeypatch):
         """Relative WINMLCLI_RULES_DIR entries are resolved from rule_loader.py dir."""
@@ -488,15 +498,15 @@ def test_env_var_relative_path_resolved_from_module_dir(self, monkeypatch):
 
         dirs = get_runtime_rules_search_dirs()
 
-        assert len(dirs) == 2
+        assert len(dirs) == 1
         assert dirs[0] == (_RULE_LOADER_DIR / relative_entry).resolve()
-        assert dirs[1] == _DEFAULT_RUNTIME_RULES_DIR
 
     def test_env_var_empty_ignored(self, monkeypatch):
         """Empty WINMLCLI_RULES_DIR is treated as unset."""
         monkeypatch.setenv("WINMLCLI_RULES_DIR", "  ")
         dirs = get_runtime_rules_search_dirs()
         assert len(dirs) == 1
+        assert dirs[0] == _DEFAULT_RUNTIME_RULES_DIR
 
 
 class TestRuntimeRules4CharKeyRoundTrip:
@@ -555,16 +565,21 @@ def test_round_trip_all_files_under_runtime_rules_search_dirs(self, monkeypatch,
 class TestRuntimeRulesDebugSearchDirs:
     """Test get_runtime_rules_debug_search_dirs behavior."""
 
-    def test_debug_env_var_adds_dirs(self, monkeypatch):
-        """WINMLCLI_RULES_DIR_FOR_DEBUG adds extra debug search directories."""
+    def test_debug_env_var_single_dir(self, monkeypatch):
+        """WINMLCLI_RULES_DIR_FOR_DEBUG uses a single debug search directory."""
+        monkeypatch.setenv(WINMLCLI_RULES_DIR_FOR_DEBUG_ENV, "/debug/path1")
+        dirs = get_runtime_rules_debug_search_dirs()
+        assert len(dirs) == 1
+        assert dirs[0] == Path("/debug/path1").resolve()
+
+    def test_debug_env_var_not_split_on_pathsep(self, monkeypatch):
+        """Only one debug directory is supported: not split on os.pathsep."""
         monkeypatch.setenv(
             WINMLCLI_RULES_DIR_FOR_DEBUG_ENV,
             f"/debug/path1{os.pathsep}/debug/path2",
         )
         dirs = get_runtime_rules_debug_search_dirs()
-        assert len(dirs) == 2
-        assert dirs[0] == Path("/debug/path1").resolve()
-        assert dirs[1] == Path("/debug/path2").resolve()
+        assert len(dirs) == 1
 
     def test_debug_env_var_relative_path_resolved_from_module_dir(self, monkeypatch):
         """Relative WINMLCLI_RULES_DIR_FOR_DEBUG entries are module-dir relative."""
@@ -599,23 +614,23 @@ def test_resolve_parquet_finds_file_in_env_dir_provider_subdir(self, monkeypatch
             assert result == nested_dir.resolve() / parquet_name
             assert result.exists()
 
-    def test_resolve_parquet_returns_none_when_missing(self, monkeypatch):
-        """When parquet is missing everywhere, resolve returns None."""
+    def test_resolve_parquet_returns_candidate_when_missing(self, monkeypatch):
+        """When parquet is missing, resolve still returns the preferred candidate path."""
         monkeypatch.delenv("WINMLCLI_RULES_DIR", raising=False)
         parquet_name = "missing_rule.parquet"
 
         result = resolve_rule_parquet_path(parquet_name)
-        assert result is None
+        assert result == _DEFAULT_RUNTIME_RULES_DIR / parquet_name
 
-    def test_resolve_parquet_ignores_flat_layout(self, monkeypatch):
-        """Flat parquet under search dir is ignored; provider subdir is required."""
+    def test_resolve_parquet_returns_provider_subdir_candidate_for_flat_layout(self, monkeypatch):
+        """Flat parquet does not affect the returned provider-subdir candidate path."""
         with tempfile.TemporaryDirectory() as tmpdir:
             parquet_name = "Split_QNNExecutionProvider_NPU_ai.onnx_opset13.parquet"
             (Path(tmpdir) / parquet_name).write_bytes(b"PAR1")
             monkeypatch.setenv("WINMLCLI_RULES_DIR", tmpdir)
 
             result = resolve_rule_parquet_path(parquet_name)
-            assert result is None
+            assert result == Path(tmpdir).resolve() / "QNNExecutionProvider_NPU" / parquet_name
 
     def test_resolve_parquet_for_debug_prefers_debug_dir(self, monkeypatch):
         """for_debug=True should prioritize WINMLCLI_RULES_DIR_FOR_DEBUG entries first."""
@@ -634,8 +649,8 @@ def test_resolve_parquet_for_debug_prefers_debug_dir(self, monkeypatch):
             result = resolve_rule_parquet_path(parquet_name, for_debug=True)
             assert result == debug_file.resolve()
 
-    def test_resolve_parquet_for_debug_falls_back_to_rules_dir(self, monkeypatch):
-        """for_debug=True falls back to normal search dirs when debug file is missing."""
+    def test_resolve_parquet_for_debug_returns_debug_candidate_even_if_missing(self, monkeypatch):
+        """for_debug=True returns debug candidate path without checking existence."""
         with tempfile.TemporaryDirectory() as rules_tmp, tempfile.TemporaryDirectory() as debug_tmp:
             parquet_name = "Split_QNNExecutionProvider_NPU_ai.onnx_opset13.parquet"
             rules_file = Path(rules_tmp) / "QNNExecutionProvider_NPU" / parquet_name
@@ -646,4 +661,4 @@ def test_resolve_parquet_for_debug_falls_back_to_rules_dir(self, monkeypatch):
             monkeypatch.setenv(WINMLCLI_RULES_DIR_FOR_DEBUG_ENV, debug_tmp)
 
             result = resolve_rule_parquet_path(parquet_name, for_debug=True)
-            assert result == rules_file.resolve()
+            assert result == Path(debug_tmp).resolve() / "QNNExecutionProvider_NPU" / parquet_name
diff --git a/tests/unit/analyze/test_static_analyzer_cli.py b/tests/unit/analyze/test_static_analyzer_cli.py
index 9649c2911..d1e412d15 100644
--- a/tests/unit/analyze/test_static_analyzer_cli.py
+++ b/tests/unit/analyze/test_static_analyzer_cli.py
@@ -1225,6 +1225,20 @@ class TestAnalyzeEPDeviceSelectionMatrix:
                 None,
             ),
             ("openvino", "gpu", 0, [("OpenVINOExecutionProvider", "GPU")], None),
+            # ep=auto, device=all: best available EP *per device* rather than one
+            # ref-device EP fanned across all devices. GPU->NvTensorRTRTX,
+            # NPU->OpenVINO, CPU->OpenVINO from the simulated local matrix.
+            (
+                None,
+                "all",
+                0,
+                [
+                    ("NvTensorRTRTXExecutionProvider", "GPU"),
+                    ("OpenVINOExecutionProvider", "NPU"),
+                    ("OpenVINOExecutionProvider", "CPU"),
+                ],
+                None,
+            ),
             # ep=all, device=all: every (ep, device) combo allowed by EP_SUPPORTED_DEVICES.
             (
                 "all",
@@ -1253,6 +1267,7 @@ class TestAnalyzeEPDeviceSelectionMatrix:
             "qnn-empty",
             "qnn-all",
             "openvino-gpu",
+            "auto-all",
             "all-all",
         ],
     )

From 32a8447579df34b61e0d865f33fb28bb5fd04ebe Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhiwang@microsoft.com>
Date: Tue, 23 Jun 2026 11:25:59 +0800
Subject: [PATCH 3/4] fix: declare psutil as a runtime dependency (#937)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

`winml perf` crashes on a clean install with `No module named 'psutil'`
because `src/winml/modelkit/session/monitor/memory_tracker.py` imports
`psutil` at module level, and the perf flow imports that module
unconditionally. `psutil` was never declared in `[project].dependencies`
— only the dev type stub `types-psutil` is present — so the published
wheel's `Requires-Dist` omits it, breaking `winml perf` (and `--monitor`
/ `--memory`) out-of-the-box for every user.

Regression from #861 (`feat: add --memory flag`); `memory_tracker.py`
did not exist in v0.1.0, so `winml perf` was unaffected there.
Installing `psutil` manually confirms perf/build/`--monitor` otherwise
work correctly — the only defect is the missing dependency declaration.

## Change

Add `psutil>=7` to `[project].dependencies` (aligns with the existing
`types-psutil>=7.2.2` stub).

Targeting `release/v0.2.0` directly as a release hotfix; `main` will
pick it up via the post-release merge-back.

Closes #936
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 46fa96dc7..3465a921b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,7 @@ dependencies = [
   # non-functional but huge diffs across every generated rule artifact. If you
   # truly must bump this, regenerate ALL parquet artifacts in the same change.
   "pandas==2.3.3",
+  "psutil>=7",
   "pydantic>=2",
   "python-multipart>=0.0.22",
   "rapidfuzz>=3.9",

From 1819375da59cb6e0412c26145bad61ebf11b61d1 Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhiwang@microsoft.com>
Date: Tue, 23 Jun 2026 21:18:37 +0800
Subject: [PATCH 4/4] Release v0.2.0: CHANGELOG + version bump (#938)

T-2 release prep for **v0.2.0**, targeting `release/v0.2.0` so the
release notes and version land on the branch before tagging.

- **CHANGELOG**: add the v0.2.0 entry covering the 96 PRs merged since
`v0.1.0`.
- **Version**: bump `version` `0.1.0` -> `0.2.0` in `pyproject.toml`.

Merge-back to `main` happens at T+1.
---
 CHANGELOG.md   | 39 +++++++++++++++++++++++++++++++++++++++
 pyproject.toml |  2 +-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d2f5bfb0..68fb2f1f5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,45 @@ All notable changes to this project are documented in this file.
 
 The format is loosely based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
+## WinML CLI v0.2.0
+
+This cycle unifies **task detection** across the CLI (modality- and architecture-aware) and expands the eval and perf surfaces — new depth-estimation and tensor-similarity evaluators, a full SA eval pipeline with an HTML report, `winml perf --memory` / `--ep-options`, and `--format json` on `eval` / `analyze` / `perf`. `winml compile` gains a multi-model shared EP context, `winml build` gains `--precision`, and timm image-classification is supported. See the behavior changes below.
+
+### ⚠️ Behavior changes
+
+- `winml perf` no longer compiles by default — added `--compile/--no-compile`, defaulting to no-compile (#879).
+- Boolean CLI options are now `--flag/--no-flag` pairs (#844).
+- Telemetry is enabled in the shipped wheel; consent reworded as "unlinked pseudonymized" (#810).
+
+### ✨ Improvements
+
+- **Task detection** — modality- and architecture-aware `detect_task`, unified across commands via `resolve_task` / `TaskResolution` (#807, #841, #878).
+- `winml perf` — `--memory` reports RAM/VRAM per phase (#861); `--ep-options` passes runtime EP options (#865, #889); output now shows the model path and precision (#875).
+- `winml compile` — multi-model shared EP context with a selectable backend (#871).
+- `winml build` — added `--precision` (#914).
+- `winml inspect` — renders composite (pipeline-led) model structure (#903).
+- `winml analyze` — `--ep` / `--device` auto resolves to a single best target (#919); faster re-runs plus a `--debug` rule locator (#906).
+- `winml eval` — new SA eval pipeline with per-stage perf and an HTML report (#599); depth-estimation (#326, #437) and tensor-similarity (#805) evaluators; scripts track ONNX size and sanitize output (#755).
+- Cross-command — `--format json` on `eval` / `analyze` / `perf` (#855); `--allow-unsupported-nodes` on `perf` / `build` / `eval` / `run` (#821).
+- Quality of life — timm image-classification via library routing (#790); `~` expanded in paths (#815); progress bar during EP warmup (#788); refreshed `--list-device` coloring (#812).
+
+### 🐛 Fixes
+
+- **`winml perf`** — declared `psutil` as a runtime dependency, fixing a crash on clean install (#937); composite (dual-encoder) models supported (#866); HF and ONNX paths unified through `PerfBenchmark` (#659); `--monitor` live chart in `--module` mode (#654, #920); fixed `rich` Live thread crashes (#832).
+- **`winml analyze`** — fixed coverage-counting bugs (#922); analyzer API EP list matches the CLI (#803); resolved Pad / Gemm rule conflicts (#906).
+- **Task / config validation** — fill-mask heads no longer misdetected as `text2text-generation` (#851); resolved vision feature-extraction model-task inconsistency (#786); model task validated in config (#723); full encoder-decoder composite built for no-task seq2seq (#850, #862); device/EP combination validated without a system check (#780).
+- **`winml export`** — `.data` files written to the output dir, not the cwd (#853); timm `image_size` from `pretrained_cfg` (#806).
+- **`winml inspect` / `winml catalog`** — `--task` validated at parse time (#546, #771); `catalog -t` short flag aligned (#541, #772); VitisAI EP ordered last, catalog table width fixed (#763).
+- **Feature extraction** — `last_hidden_state` now populated in the output (#863).
+- **`winml optimize`** — untie batched constant `MatMul` for OpenVINO GPU (#817).
+- **`winml eval`** — fixed failures on AMD hosts (#783); cleanup runs on `SKIP_*` / exception paths (#890).
+- **CLI output** — quieted `optimum` logger noise (#904); unified verbosity, logger routed to stderr (#566, #793).
+
+### 📦 Assets
+
+- `winml_cli-0.2.0-py3-none-any.whl`
+- `rules-v0.2.0.zip`
+
 ## WinML CLI v0.1.0
 
 First **public preview** release. With the Windows ML 2.0 baseline now in place, this release shifts focus to polishing the CLI surface: faster `winml inspect` / `winml eval`, more accurate device & EP resolution, a real PyPI release pipeline, and a meaningful pass over sysinfo and quantization behavior.
diff --git a/pyproject.toml b/pyproject.toml
index 3465a921b..3e5d69d1f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ requires = [ "setuptools>=61", "wheel" ]
 
 [project]
 name = "winml-cli"
-version = "0.1.0"
+version = "0.2.0"
 description = "Accelerate Model Deployment on WinML"
 readme = "README.md"
 keywords = [ "onnx", "winml" ]