From 9e6425954281708e796fbe894d16ee57c6915ae5 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 9 Jun 2026 12:33:46 -0700 Subject: [PATCH] executorch: log once when the TRT delegate stages host I/O; document the device-handling contract The ExecuTorch TensorRT delegate binds device-resident I/O pointers straight into the execution context (zero-copy). When an I/O tensor is host memory it silently stages through a per-call device allocation plus cudaMemcpyAsync, with no signal. That is an invisible performance cliff, and a silent regression hazard once device-aware memory planning is expected to keep delegate I/O on device. Emit a single Info log the first time an engine stages host I/O. Also document, at execute()'s header, the delegate's deliberate device-handling contract (runtime pointer-sniffing instead of reading the AOT device metadata, self-managed host/device staging, and the engine-baked device_id as the runtime source of truth) so a future change does not "reconcile" it into breakage. --- .../executorch/TensorRTBackend.h | 3 +++ .../executorch/TensorRTBackend.cpp | 25 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/cpp/include/torch_tensorrt/executorch/TensorRTBackend.h b/cpp/include/torch_tensorrt/executorch/TensorRTBackend.h index 31383f9e17..ddc2ea89b2 100644 --- a/cpp/include/torch_tensorrt/executorch/TensorRTBackend.h +++ b/cpp/include/torch_tensorrt/executorch/TensorRTBackend.h @@ -61,6 +61,9 @@ struct EngineHandle { size_t num_outputs = 0; int device_id = 0; bool unified_memory = false; + // Set the first time execute() stages an IO tensor through a host<->device + // copy; guards a one-shot Info log (read/written under mu). + bool staged_logged = false; std::mutex mu; // Makes the skip-sync fast path safe to reuse: TensorRT forbids reconfiguring or // destroying an execution context while one of its enqueues is in flight, so when diff --git a/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp b/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp index b2e3b08232..893f5dad9e 100644 --- a/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp +++ b/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp @@ -317,6 +317,20 @@ Result TensorRTBackend::init( // Args layout (mirroring the Python exporter): // args[0 .. num_inputs-1] – input EValues // args[num_inputs .. num_inputs+num_outputs-1] – output EValues +// +// Device-handling contract (intentional divergence from the CUDA/AOTI delegate; +// do not "reconcile" without revisiting this): +// 1. We ignore the per-tensor device metadata the partitioner emits (the AOT +// target_device CompileSpec, serialized into extra_tensor_info's +// device_type/device_index). Instead we sniff each pointer at runtime +// (is_cuda_accessible_ptr) so the backend stays correct for host- or +// device-resident inputs; asserting device_type would reject valid host +// inputs today. +// 2. We stage host<->device ourselves via cudaMemcpyAsync into cached scratch +// buffers rather than ExecuTorch device-copy ops or a DeviceAllocator; the +// copy is conditional on the runtime pointer check and shares the stream. +// 3. The engine-baked device_id (applied via cudaSetDevice) is the runtime +// source of truth for the GPU, not the AOT target_device. // --------------------------------------------------------------------------- Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* handle, Span args) const { (void)context; @@ -565,6 +579,17 @@ Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* } } + // One-shot diagnostic: host-resident I/O was just staged through a device + // copy, so this engine is off the zero-copy fast path. + if ((input_staged_from_host || output_staged_to_host) && !engine->staged_logged) { + ET_LOG( + Info, + "TensorRTBackend::execute: an I/O tensor is host memory; staging it " + "through a device copy (zero-copy requires device-resident tensors). " + "Logged once per engine."); + engine->staged_logged = true; + } + // ------------------------------------------------------------------ // 4. Enqueue inference on the current CUDA stream // ------------------------------------------------------------------