From b2b0d446775d1fe9ebc5e07de2b0ce7fa165bbc6 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <anthony@shoumikh.in>
Date: Tue, 9 Jun 2026 12:39:54 -0700
Subject: [PATCH 1/2] executorch: derive the TensorRT delegate target_device
 from the engine's real device index

TensorRTPartitioner hardcoded target_device=cuda:0 for every partition, so a cuda:N engine shipped a .pte whose delegate-boundary tensors were labeled cuda:0. The runtime still ran on the correct GPU (it reads the device from the engine blob), but ExecuTorch's device-aware memory planning reads this metadata to place buffers, so the label needs to be correct once that planning becomes the default.

Derive target_device per export from the engine's real device index, reusing the backend's own _get_engine_info_from_edge_program + _parse_device_id so the index cannot drift from the runtime blob. Fall back to cuda:0 when the program does not have exactly one engine node (multiple TRT partitions) or the index is unreadable; per-partition multi-GPU labeling is left to a follow-up. An explicit caller-provided target_device is used verbatim, unchanged.

Also document that target_device is AOT-only metadata: the runtime selects the GPU from the serialized engine blob, not from this value.
---
 py/torch_tensorrt/executorch/partitioner.py | 73 +++++++++++++++++----
 1 file changed, 62 insertions(+), 11 deletions(-)
diff --git a/py/torch_tensorrt/executorch/partitioner.py b/py/torch_tensorrt/executorch/partitioner.py
index 7fde508450..c7637a90b7 100644
--- a/py/torch_tensorrt/executorch/partitioner.py
+++ b/py/torch_tensorrt/executorch/partitioner.py
@@ -1,5 +1,6 @@
 # ExecuTorch partitioner: partition by execute_engine nodes.
 
+import logging
 from typing import Callable, Dict, List, Optional, Tuple
 
 import torch
@@ -12,7 +13,12 @@
 from executorch.exir.backend.utils import tag_constant_data
 from torch.export import ExportedProgram
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
-from torch_tensorrt.executorch.backend import TensorRTBackend
+from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import DEVICE_IDX
+from torch_tensorrt.executorch.backend import (
+    TensorRTBackend,
+    _get_engine_info_from_edge_program,
+    _parse_device_id,
+)
 from torch_tensorrt.executorch.operator_support import TensorRTOperatorSupport
 
 # Key recognized by ExecuTorch's PropagateDevicePass that tags delegate I/O
@@ -29,6 +35,8 @@
 except ImportError:
     _TARGET_DEVICE_COMPILE_SPEC_KEY = "target_device"
 
+logger = logging.getLogger(__name__)
+
 
 class TensorRTPartitioner(Partitioner):  # type: ignore[misc]
     """Partitions the graph for TensorRT delegation.
@@ -42,6 +50,11 @@ class TensorRTPartitioner(Partitioner):  # type: ignore[misc]
     Callers targeting a non-default GPU should pre-populate
     ``compile_specs`` with the desired ``CompileSpec("target_device",
     b"cuda:<index>")`` to override the default.
+
+    Note: ``target_device`` is AOT metadata only -- it drives ExecuTorch's
+    PropagateDevicePass tagging at export time. At runtime the C++ backend
+    selects the GPU from the device baked into the serialized engine blob,
+    not from this value.
     """
 
     def __init__(
@@ -50,21 +63,45 @@ def __init__(
     ) -> None:
         super().__init__()
         self.compile_specs = list(compile_specs) if compile_specs else []
-        # Mirror CudaPartitioner: emit a target_device CompileSpec so that
-        # ExecuTorch's PropagateDevicePass tags delegate I/O TensorSpecs with
-        # the correct device, which is then serialized into the .pte's
-        # extra_tensor_info.device_type field.
-        if not any(
+        # Mirror CudaPartitioner: a target_device CompileSpec drives ExecuTorch's
+        # PropagateDevicePass, which tags delegate I/O TensorSpecs with the device
+        # and serializes it into the .pte's extra_tensor_info. When the caller pins
+        # it we use that verbatim; otherwise it is derived per export from the
+        # engine's real device in partition() (engine nodes are not available here)
+        # so a cuda:N engine is not mislabeled cuda:0.
+        self._has_explicit_target_device = any(
             s.key == _TARGET_DEVICE_COMPILE_SPEC_KEY for s in self.compile_specs
-        ):
-            self.compile_specs.append(
-                CompileSpec(_TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0")
-            )
+        )
         self.delegation_spec = DelegationSpec(
             backend_id=TensorRTBackend.__name__,
             compile_specs=self.compile_specs,
         )
 
+    def _resolve_target_device(self, exported_program: ExportedProgram) -> bytes:
+        """Best-effort ``target_device`` for the delegate-boundary TensorSpecs.
+
+        Reuses the backend's own engine-info extraction so the device index
+        cannot drift from the runtime blob. Any extraction failure -- no single
+        engine node (zero or multiple TRT partitions) or an unreadable index --
+        falls back to ``cuda:0``; per-partition multi-GPU labeling is left to a
+        follow-up.
+        """
+        try:
+            engine_info = _get_engine_info_from_edge_program(exported_program)
+            return f"cuda:{_parse_device_id(engine_info[DEVICE_IDX])}".encode()
+        except Exception as e:
+            # Broad by design: any extraction failure must fall back, not abort
+            # the export. Warn so a non-default GPU silently labeled cuda:0 stays
+            # diagnosable.
+            logger.warning(
+                "Could not derive target_device from the TensorRT engine (%s); "
+                "falling back to cuda:0. A non-default GPU engine may be "
+                'mislabeled -- pin it via CompileSpec("target_device", '
+                'b"cuda:<index>").',
+                e,
+            )
+            return b"cuda:0"
+
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         capability_partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
@@ -73,12 +110,26 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         )
         partition_list = capability_partitioner.propose_partitions()
 
+        if self._has_explicit_target_device:
+            delegation_spec = self.delegation_spec
+        else:
+            delegation_spec = DelegationSpec(
+                backend_id=TensorRTBackend.__name__,
+                compile_specs=self.compile_specs
+                + [
+                    CompileSpec(
+                        _TARGET_DEVICE_COMPILE_SPEC_KEY,
+                        self._resolve_target_device(exported_program),
+                    )
+                ],
+            )
+
         partition_tags: Dict[str, DelegationSpec] = {}
         for partition in partition_list:
             tag = f"tensorrt_{partition.id}"
             for node in partition.nodes:
                 node.meta["delegation_tag"] = tag
-            partition_tags[tag] = self.delegation_spec
+            partition_tags[tag] = delegation_spec
 
         tag_constant_data(exported_program)
 

From 7d40e29920ea0845a04635b38dbce1c92b870555 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <anthony@shoumikh.in>
Date: Tue, 9 Jun 2026 12:42:59 -0700
Subject: [PATCH 2/2] executorch: unit-test the TensorRT partitioner
 target_device derivation

Covers the per-export device derivation: target_device taken from the engine's
real device index, a cuda:0 fallback when the engine info is unreadable (for
example multiple TRT partitions), and an explicit caller-provided target_device
used verbatim. CPU-only unit test that monkeypatches the capability partitioner
and engine-info extraction, matching the existing tests/py/dynamo/executorch
style.
---
 .../test_partitioner_target_device.py         | 118 ++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 tests/py/dynamo/executorch/test_partitioner_target_device.py

diff --git a/tests/py/dynamo/executorch/test_partitioner_target_device.py b/tests/py/dynamo/executorch/test_partitioner_target_device.py
new file mode 100644
index 0000000000..86a3a16ae0
--- /dev/null
+++ b/tests/py/dynamo/executorch/test_partitioner_target_device.py
@@ -0,0 +1,118 @@
+from types import SimpleNamespace
+
+import pytest
+
+executorch = pytest.importorskip("executorch.exir")
+
+import torch  # noqa: E402
+from executorch.exir.backend.compile_spec_schema import CompileSpec  # noqa: E402
+from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import (  # noqa: E402
+    DEVICE_IDX,
+    ENGINE_IDX,
+    SERIALIZATION_LEN,
+)
+from torch_tensorrt.executorch.partitioner import (  # noqa: E402
+    _TARGET_DEVICE_COMPILE_SPEC_KEY,
+    TensorRTPartitioner,
+)
+
+
+# A realistic single-engine edge program so partition() runs the *real*
+# _get_engine_info_from_edge_program / _parse_device_id path. That is what
+# guards "an engine node is present and its device is extractable at partition()
+# time" -- a monkeypatched extractor would not. Mirrors the mocked edge programs
+# in tests/py/dynamo/executorch/test_backend.py.
+class _SchemaTarget:
+    def __init__(self, name):
+        self._schema = SimpleNamespace(name=name)
+
+
+def _engine_node(device_id):
+    engine_info = [""] * SERIALIZATION_LEN
+    engine_info[ENGINE_IDX] = torch.frombuffer(bytearray(b"engine"), dtype=torch.uint8)
+    engine_info[DEVICE_IDX] = device_id
+    return SimpleNamespace(
+        op="call_function",
+        target=_SchemaTarget("tensorrt::no_op_placeholder_for_execute_engine"),
+        args=(["x"], *engine_info),
+        name="trt_node",
+    )
+
+
+def _edge_program(*nodes):
+    return SimpleNamespace(
+        graph_module=SimpleNamespace(graph=SimpleNamespace(nodes=list(nodes))),
+        constants={},
+    )
+
+
+class _FakeCapabilityPartitioner:
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def propose_partitions(self):
+        return [SimpleNamespace(id=1, nodes=[SimpleNamespace(meta={})])]
+
+
+@pytest.fixture(autouse=True)
+def _stub_partition_internals(monkeypatch):
+    # Both need a real fx GraphModule, so stub them out -- the engine-info
+    # extraction under test still runs for real against the mocked node.
+    monkeypatch.setattr(
+        "torch_tensorrt.executorch.partitioner.CapabilityBasedPartitioner",
+        _FakeCapabilityPartitioner,
+    )
+    monkeypatch.setattr(
+        "torch_tensorrt.executorch.partitioner.tag_constant_data",
+        lambda exported_program: None,
+    )
+
+
+def _target_device(result):
+    spec = result.partition_tags["tensorrt_1"]
+    for cs in spec.compile_specs:
+        if cs.key == _TARGET_DEVICE_COMPILE_SPEC_KEY:
+            return cs.value
+    return None
+
+
+@pytest.mark.unit
+def test_target_device_derived_for_default_gpu():
+    result = TensorRTPartitioner().partition(_edge_program(_engine_node("0")))
+    assert _target_device(result) == b"cuda:0"
+
+
+@pytest.mark.unit
+def test_target_device_derived_for_nonzero_gpu():
+    # The bug this fixes: a cuda:1 engine must not be mislabeled cuda:0.
+    result = TensorRTPartitioner().partition(_edge_program(_engine_node("1")))
+    assert _target_device(result) == b"cuda:1"
+
+
+@pytest.mark.unit
+def test_target_device_falls_back_to_cuda0_on_multiple_engines():
+    # >1 engine node -> real extraction raises -> contract fallback to cuda:0.
+    result = TensorRTPartitioner().partition(
+        _edge_program(_engine_node("1"), _engine_node("2"))
+    )
+    assert _target_device(result) == b"cuda:0"
+
+
+@pytest.mark.unit
+def test_target_device_falls_back_to_cuda0_on_malformed_graph():
+    # An unexpected graph shape makes the real extraction raise; the broadened
+    # except must still fall back to cuda:0 rather than abort the export.
+    bad_node = SimpleNamespace(op="call_function", target=SimpleNamespace(), name="x")
+    result = TensorRTPartitioner().partition(_edge_program(bad_node))
+    assert _target_device(result) == b"cuda:0"
+
+
+@pytest.mark.unit
+def test_explicit_target_device_used_verbatim():
+    # Engine reports cuda:0, but the caller pinned cuda:3 -> the pin wins and
+    # extraction is skipped entirely.
+    partitioner = TensorRTPartitioner(
+        compile_specs=[CompileSpec(_TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:3")]
+    )
+    result = partitioner.partition(_edge_program(_engine_node("0")))
+    assert _target_device(result) == b"cuda:3"