Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cpp/include/torch_tensorrt/executorch/TensorRTBackend.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ struct EngineHandle {
size_t num_outputs = 0;
int device_id = 0;
bool unified_memory = false;
// Set the first time execute() stages an I/O tensor through a host<->device
// copy; guards a one-shot Info log (read/written under mu).
bool staged_logged = false;
std::mutex mu;
// Makes the skip-sync fast path safe to reuse: TensorRT forbids reconfiguring or
// destroying an execution context while one of its enqueues is in flight, so when
Expand Down
25 changes: 25 additions & 0 deletions cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,20 @@ Result<DelegateHandle*> TensorRTBackend::init(
// Args layout (mirroring the Python exporter):
// args[0 .. num_inputs-1] – input EValues
// args[num_inputs .. num_inputs+num_outputs-1] – output EValues
//
// Device-handling contract (intentional divergence from the CUDA/AOTI delegate;
// do not "reconcile" without revisiting this):
// 1. We ignore the per-tensor device metadata the partitioner emits (the AOT
// target_device CompileSpec, serialized into extra_tensor_info's
// device_type/device_index). Instead we sniff each pointer at runtime
// (is_cuda_accessible_ptr) so the backend stays correct for host- or
// device-resident inputs; asserting device_type would reject valid host
// inputs today.
// 2. We stage host<->device ourselves via cudaMemcpyAsync into cached scratch
// buffers rather than ExecuTorch device-copy ops or a DeviceAllocator; the
// copy is conditional on the runtime pointer check and shares the stream.
// 3. The engine-baked device_id (applied via cudaSetDevice) is the runtime
// source of truth for the GPU, not the AOT target_device.
// ---------------------------------------------------------------------------
Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle* handle, Span<EValue*> args) const {
(void)context;
Expand Down Expand Up @@ -565,6 +579,17 @@ Error TensorRTBackend::execute(BackendExecutionContext& context, DelegateHandle*
}
}

// One-shot diagnostic: host-resident I/O was just staged through a device
// copy, so this engine is off the zero-copy fast path.
if ((input_staged_from_host || output_staged_to_host) && !engine->staged_logged) {
ET_LOG(
Info,
"TensorRTBackend::execute: an I/O tensor is host memory; staging it "
"through a device copy (zero-copy requires device-resident tensors). "
"Logged once per engine.");
engine->staged_logged = true;
}

// ------------------------------------------------------------------
// 4. Enqueue inference on the current CUDA stream
// ------------------------------------------------------------------
Expand Down
Loading