From 9e6425954281708e796fbe894d16ee57c6915ae5 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <anthony@shoumikh.in>
Date: Tue, 9 Jun 2026 12:33:46 -0700
Subject: [PATCH] executorch: log once when the TRT delegate stages host I/O;
 document the device-handling contract

The ExecuTorch TensorRT delegate binds device-resident I/O pointers straight into the execution context (zero-copy). When an I/O tensor resides in host memory, the delegate instead stages it through a per-call device allocation plus cudaMemcpyAsync, without emitting any diagnostic. That is an invisible performance cliff, and a silent regression hazard once device-aware memory planning is expected to keep delegate I/O on device. Emit a single Info log the first time an engine stages host I/O.

Also document, at execute()'s header, the delegate's deliberate device-handling contract (runtime pointer-sniffing instead of reading the AOT device metadata, self-managed host/device staging, and the engine-baked device_id as the runtime source of truth) so a future change does not "reconcile" it into breakage.
---
 .../executorch/TensorRTBackend.h              |  3 +++
 .../executorch/TensorRTBackend.cpp            | 25 +++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/cpp/include/torch_tensorrt/executorch/TensorRTBackend.h b/cpp/include/torch_tensorrt/executorch/TensorRTBackend.h
index 31383f9e17..ddc2ea89b2 100644
--- a/cpp/include/torch_tensorrt/executorch/TensorRTBackend.h
+++ b/cpp/include/torch_tensorrt/executorch/TensorRTBackend.h
@@ -61,6 +61,9 @@ struct EngineHandle {
   size_t num_outputs = 0;
   int device_id = 0;
   bool unified_memory = false;
+  // Set the first time execute() stages an I/O tensor through a host<->device
+  // copy; guards the one-shot Info log. Read and written under mu.
+  bool staged_logged = false;
   std::mutex mu;
   // Makes the skip-sync fast path safe to reuse: TensorRT forbids reconfiguring or
   // destroying an execution context while one of its enqueues is in flight, so when
diff --git a/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp b/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp
index b2e3b08232..893f5dad9e 100644
--- a/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp
+++ b/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp
@@ -317,6 +317,20 @@ Result<DelegateHandle*> TensorRTBackend::init(
 // Args layout (mirroring the Python exporter):
 //   args[0 .. num_inputs-1]             – input EValues
 //   args[num_inputs .. num_inputs+num_outputs-1] – output EValues
+//
+// Device-handling contract (intentional divergence from the CUDA/AOTI delegate;
+// do not "reconcile" without revisiting this):
+//   1. We ignore the per-tensor device metadata the partitioner emits (the AOT
+//      target_device CompileSpec, serialized into extra_tensor_info's
+//      device_type/device_index). Instead we sniff each pointer at runtime
+//      (is_cuda_accessible_ptr) so the backend stays correct for host- or
+//      device-resident inputs; asserting device_type would reject valid host
+//      inputs today.
+//   2. We stage host<->device ourselves via cudaMemcpyAsync into cached scratch
+//      buffers rather than ExecuTorch device-copy ops or a DeviceAllocator; the
+//      copy is conditional on the runtime pointer check and shares the stream.
+//   3. The engine-baked device_id (applied via cudaSetDevice) is the runtime
+//      source of truth for the GPU, not the AOT target_device.
 // ---------------------------------------------------------------------------
 Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* handle, Span<EValue*> args) const {
   (void)context;
@@ -565,6 +579,17 @@ Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle*
     }
   }
 
+  // One-shot diagnostic: host-resident I/O is being staged through a device
+  // copy on this call, so this engine is off the zero-copy fast path.
+  if ((input_staged_from_host || output_staged_to_host) && !engine->staged_logged) {
+    ET_LOG(
+        Info,
+        "TensorRTBackend::execute: an I/O tensor is host memory; staging it "
+        "through a device copy (zero-copy requires device-resident tensors). "
+        "Logged once per engine.");
+    engine->staged_logged = true;
+  }
+
   // ------------------------------------------------------------------
   // 4. Enqueue inference on the current CUDA stream
   // ------------------------------------------------------------------