diff --git a/cpp/BUILD b/cpp/BUILD
index 23509b9164..8d74717444 100644
--- a/cpp/BUILD
+++ b/cpp/BUILD
@@ -96,6 +96,17 @@ cc_library(
     strip_include_prefix = "include",
 )
 
+cc_library(
+    name = "tensorrt_executorch_weight_streaming_budget",
+    srcs = [
+        "src/torch_tensorrt/executorch/WeightStreamingBudget.cpp",
+    ],
+    hdrs = [
+        "include/torch_tensorrt/executorch/WeightStreamingBudget.h",
+    ],
+    strip_include_prefix = "include",
+)
+
 cc_library(
     name = "tensorrt_executorch_binding_names",
     hdrs = [
@@ -127,6 +138,7 @@ cc_library(
     deps = [
         ":tensorrt_executorch_binding_names",
         ":tensorrt_executorch_blob_header",
+        ":tensorrt_executorch_weight_streaming_budget",
     ] + select({
         ":linux_x86_64": [
             "@executorch//:executorch_headers",
@@ -150,6 +162,7 @@ filegroup(
         "src/torch_tensorrt/executorch/README.md",
         "src/torch_tensorrt/executorch/TensorRTBackend.cpp",
         "src/torch_tensorrt/executorch/TensorRTBlobHeader.cpp",
+        "src/torch_tensorrt/executorch/WeightStreamingBudget.cpp",
     ],
 )
 
@@ -169,5 +182,6 @@ filegroup(
         "include/torch_tensorrt/executorch/TensorRTBackend.h",
         "include/torch_tensorrt/executorch/TensorRTBindingNames.h",
         "include/torch_tensorrt/executorch/TensorRTBlobHeader.h",
+        "include/torch_tensorrt/executorch/WeightStreamingBudget.h",
     ],
 )
diff --git a/cpp/include/torch_tensorrt/executorch/WeightStreamingBudget.h b/cpp/include/torch_tensorrt/executorch/WeightStreamingBudget.h
new file mode 100644
index 0000000000..8fc457542b
--- /dev/null
+++ b/cpp/include/torch_tensorrt/executorch/WeightStreamingBudget.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace torch_tensorrt {
+namespace executorch_backend {
+
+// Compile spec key that carries the weight streaming budget from export into the
+// delegate. Must match WEIGHT_STREAMING_BUDGET_COMPILE_SPEC_KEY on the Python
+// side (py/torch_tensorrt/executorch/partitioner.py).
+inline constexpr char kWeightStreamingBudgetKey[] = "weight_streaming_budget";
+
+// Result of parsing a weight streaming budget compile spec value.
+//
+// The value is a non-negative decimal integer: an explicit GPU budget in bytes.
+// The automatic budget is intentionally not encoded here: when no budget spec is
+// present, the delegate applies TensorRT's automatic budget itself, mirroring the
+// PyTorch runtimes.
+//
+//   valid == false -> a value was present but could not be parsed (or was
+//                     negative); the caller should reject the program.
+//   valid == true  -> bytes holds the parsed non-negative byte budget.
+struct WsBudget {
+  bool valid = false;
+  int64_t bytes = 0;
+};
+
+// Parses a budget from a raw byte range. The value is NOT assumed to be NUL
+// terminated; only the first nbytes bytes are read.
+WsBudget parse_weight_streaming_budget(const void* value, std::size_t nbytes);
+
+} // namespace executorch_backend
+} // namespace torch_tensorrt
diff --git a/cpp/src/torch_tensorrt/executorch/CMakeLists.txt b/cpp/src/torch_tensorrt/executorch/CMakeLists.txt
index 1b0546b545..09f71b31ca 100644
--- a/cpp/src/torch_tensorrt/executorch/CMakeLists.txt
+++ b/cpp/src/torch_tensorrt/executorch/CMakeLists.txt
@@ -23,6 +23,7 @@ find_package(Threads REQUIRED)
 set(_torchtrt_executorch_sources
     "${CMAKE_CURRENT_LIST_DIR}/TensorRTBackend.cpp"
     "${CMAKE_CURRENT_LIST_DIR}/TensorRTBlobHeader.cpp"
+    "${CMAKE_CURRENT_LIST_DIR}/WeightStreamingBudget.cpp"
 )
 
 add_library(executorch_trt_backend STATIC ${_torchtrt_executorch_sources})
diff --git a/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp b/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp
index b2e3b08232..8c914f6385 100644
--- a/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp
+++ b/cpp/src/torch_tensorrt/executorch/TensorRTBackend.cpp
@@ -8,6 +8,7 @@
 #include "torch_tensorrt/executorch/TensorRTBackend.h"
 #include "torch_tensorrt/executorch/TensorRTBindingNames.h"
 #include "torch_tensorrt/executorch/TensorRTBlobHeader.h"
+#include "torch_tensorrt/executorch/WeightStreamingBudget.h"
 
 #include <cstdint>
 #include <cstring>
@@ -225,8 +226,6 @@ Result<DelegateHandle*> TensorRTBackend::init(
     BackendInitContext& context,
     FreeableBuffer* processed,
     ArrayRef<CompileSpec> compile_specs) const {
-  (void)compile_specs;
-
   TORCHTRT_ET_CHECK_NOT_NULL(processed, Error::InvalidArgument, "TensorRTBackend::init: null processed buffer");
   TORCHTRT_ET_CHECK_NOT_NULL(processed->data(), Error::InvalidArgument, "TensorRTBackend::init: null processed buffer");
 
@@ -284,6 +283,130 @@ Result<DelegateHandle*> TensorRTBackend::init(
   TORCHTRT_ET_CHECK_NOT_NULL(
       handle->engine, Error::InvalidProgram, "TensorRTBackend::init: failed to deserialize TensorRT engine");
 
+  // Apply the weight streaming budget before the execution context is created
+  // below: TensorRT forbids changing the budget while a context is active. The
+  // budget is a non-negative decimal byte count and may come from two places, in
+  // order of precedence:
+  //   1. A load-time backend option ("weight_streaming_budget" runtime spec) that
+  //      the caller passes to Module::load(LoadBackendOptionsMap). This lets a
+  //      deployment size the budget for its own GPU without re-exporting.
+  //   2. The same key baked into the .pte as a compile spec at export, used as a
+  //      default when no load-time option is given (and the only channel for
+  //      loaders that cannot pass backend options yet, e.g. Python/Android).
+  // When neither is present and the engine supports streaming, we apply
+  // TensorRT's automatic budget, mirroring what the PyTorch runtimes do on
+  // deserialize. Negative or malformed values are rejected as InvalidProgram.
+  WsBudget ws_request;
+  bool is_explicit = false;
+
+  // (1) A load-time runtime spec takes precedence over the baked compile spec.
+  // The value is a decimal byte string; a non-negative int is also accepted for
+  // small budgets. A present-but-wrong-type or empty option is handled explicitly
+  // so a runtime option is never silently dropped. The const char* returned by
+  // get_runtime_spec points into the caller's LoadBackendOptionsMap storage, which
+  // outlives init(); we parse it immediately and keep only the int64 result.
+  const auto ws_runtime = context.get_runtime_spec<const char*>(kWeightStreamingBudgetKey);
+  if (ws_runtime.ok()) {
+    const char* const value = ws_runtime.get();
+    // The option array need not be NUL terminated (the struct is public), so
+    // bound the scan. An empty value means "unset", so fall through to (2).
+    constexpr std::size_t kRuntimeBudgetMaxScan = 256;
+    std::size_t len = 0;
+    if (value != nullptr) {
+      while (len < kRuntimeBudgetMaxScan && value[len] != '\0') {
+        ++len;
+      }
+    }
+    if (len > 0) {
+      ws_request = parse_weight_streaming_budget(value, len);
+      if (!ws_request.valid) {
+        ET_LOG(Error, "TensorRTBackend::init: malformed weight_streaming_budget runtime option");
+        return Error::InvalidProgram;
+      }
+      is_explicit = true;
+    }
+  } else if (ws_runtime.error() != Error::NotFound) {
+    // The key is present but stored as a non-string type. Accept a non-negative
+    // int for convenience (its 32-bit range only covers budgets under 2 GB);
+    // otherwise reject it so a wrong-typed option is never silently ignored.
+    const auto ws_runtime_int = context.get_runtime_spec<int>(kWeightStreamingBudgetKey);
+    if (ws_runtime_int.ok() && ws_runtime_int.get() >= 0) {
+      ws_request.valid = true;
+      ws_request.bytes = ws_runtime_int.get();
+      is_explicit = true;
+    } else {
+      ET_LOG(
+          Error,
+          "TensorRTBackend::init: weight_streaming_budget runtime option must be a "
+          "non-negative int or a decimal byte string");
+      return Error::InvalidProgram;
+    }
+  }
+
+  // (2) Otherwise fall back to the compile spec baked into the .pte at export.
+  if (!is_explicit) {
+    const CompileSpec* ws_spec = nullptr;
+    for (const auto& spec : compile_specs) {
+      if (spec.key != nullptr && std::strcmp(spec.key, kWeightStreamingBudgetKey) == 0) {
+        if (ws_spec != nullptr) {
+          // The budget must appear at most once; a second match means the spec
+          // list is inconsistent, so reject the program instead of guessing.
+          ET_LOG(Error, "TensorRTBackend::init: duplicate weight_streaming_budget compile spec");
+          return Error::InvalidProgram;
+        }
+        ws_spec = &spec;
+      }
+    }
+    if (ws_spec != nullptr) {
+      ws_request = parse_weight_streaming_budget(ws_spec->value.buffer, ws_spec->value.nbytes);
+      if (!ws_request.valid) {
+        ET_LOG(Error, "TensorRTBackend::init: malformed weight_streaming_budget compile spec");
+        return Error::InvalidProgram;
+      }
+      is_explicit = true;
+    }
+  }
+
+  const int64_t streamable = handle->engine->getStreamableWeightsSize();
+  if (streamable > 0) {
+    // getStreamableWeightsSize is > 0 only when the engine was built with
+    // BuilderFlag::kWEIGHT_STREAMING.
+    int64_t budget;
+    if (is_explicit) {
+      // An explicit budget is a non-negative byte count, clamped to the
+      // streamable size (TensorRT also caps it, but clamp for a clear log).
+      budget = ws_request.bytes > streamable ? streamable : ws_request.bytes;
+    } else {
+      budget = handle->engine->getWeightStreamingAutomaticBudget();
+    }
+    if (!handle->engine->setWeightStreamingBudgetV2(budget)) {
+      if (!is_explicit && handle->engine->setWeightStreamingBudgetV2(0)) {
+        // The automatic budget could not be applied; fall back to budget 0, which
+        // streams all weights (minimum resident memory) and always fits.
+        ET_LOG(Info, "TensorRTBackend::init: automatic weight streaming budget failed; falling back to budget 0 (stream all weights)");
+      } else {
+        ET_LOG(
+            Error,
+            "TensorRTBackend::init: setWeightStreamingBudgetV2 failed (requested=%lld%s)",
+            (long long)budget,
+            is_explicit ? "" : ", and fallback to 0 also failed");
+        return Error::InvalidProgram;
+      }
+    }
+    ET_LOG(
+        Info,
+        "TensorRTBackend::init: weight streaming budget=%lld streamable=%lld scratch=%lld",
+        (long long)handle->engine->getWeightStreamingBudgetV2(),
+        (long long)streamable,
+        (long long)handle->engine->getWeightStreamingScratchMemorySize());
+  } else if (is_explicit) {
+    // A budget was requested but the engine has no streamable weights (it was not
+    // built with enable_weight_streaming=True, or nothing is streamable). The
+    // engine is still valid and runs fully resident, so log and continue rather
+    // than fail; failing here would break mixed multi-engine programs.
+    ET_LOG(Info, "TensorRTBackend::init: weight_streaming_budget ignored; engine does not support weight streaming (build with enable_weight_streaming=True)");
+  }
+
   Error err = initialize_engine_io(*handle);
   if (err != Error::Ok) {
     return err;
diff --git a/cpp/src/torch_tensorrt/executorch/WeightStreamingBudget.cpp b/cpp/src/torch_tensorrt/executorch/WeightStreamingBudget.cpp
new file mode 100644
index 0000000000..3bf9ef405f
--- /dev/null
+++ b/cpp/src/torch_tensorrt/executorch/WeightStreamingBudget.cpp
@@ -0,0 +1,35 @@
+#include "torch_tensorrt/executorch/WeightStreamingBudget.h"
+
+#include <charconv>
+#include <cstddef>
+#include <cstdint>
+#include <system_error>
+
+namespace torch_tensorrt {
+namespace executorch_backend {
+
+WsBudget parse_weight_streaming_budget(const void* value, std::size_t nbytes) {
+  WsBudget result; // valid == false until the value is fully parsed
+
+  if (value == nullptr || nbytes == 0) {
+    return result;
+  }
+  // The value is a non-negative decimal byte budget and is not NUL terminated.
+  // std::from_chars consumes only ASCII digits (no leading whitespace, sign, or
+  // base prefix), so a leftover byte (trailing garbage or an embedded NUL), an
+  // out-of-range value, or a negative leaves the result invalid.
+  const char* const first = static_cast<const char*>(value);
+  const char* const last = first + nbytes;
+  int64_t parsed = 0;
+  const std::from_chars_result fc = std::from_chars(first, last, parsed);
+  if (fc.ec != std::errc() || fc.ptr != last || parsed < 0) {
+    return result;
+  }
+
+  result.valid = true;
+  result.bytes = parsed;
+  return result;
+}
+
+} // namespace executorch_backend
+} // namespace torch_tensorrt
diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py
index f052954efb..3d09b06a06 100644
--- a/py/torch_tensorrt/_compile.py
+++ b/py/torch_tensorrt/_compile.py
@@ -30,32 +30,26 @@
 
 if ENABLED_FEATURES.torchscript_frontend:
     import torch_tensorrt.ts
-    from torch_tensorrt.ts._compiler import compile as torchscript_compile
     from torch_tensorrt.ts._compiler import (
+        compile as torchscript_compile,
         convert_method_to_trt_engine as ts_convert_method_to_trt_engine,
     )
 
 if ENABLED_FEATURES.dynamo_frontend:
     from torch.export import ExportedProgram
-    from torch_tensorrt.dynamo._compiler import compile as dynamo_compile
     from torch_tensorrt.dynamo._compiler import (
+        compile as dynamo_compile,
         convert_exported_program_to_serialized_trt_engine as dynamo_convert_exported_program_to_serialized_trt_engine,
-    )
-    from torch_tensorrt.dynamo._compiler import (
         cross_compile_for_windows as dynamo_cross_compile_for_windows,
-    )
-    from torch_tensorrt.dynamo._compiler import (
         load_cross_compiled_exported_program as dynamo_load_cross_compiled_exported_program,
-    )
-    from torch_tensorrt.dynamo._compiler import (
         save_cross_compiled_exported_program as dynamo_save_cross_compiled_exported_program,
     )
     from torch_tensorrt.dynamo._defaults import default_device
     from torch_tensorrt.dynamo._tracer import (
         get_dynamic_shapes_args,
         get_dynamic_shapes_kwargs,
+        trace as dynamo_trace,
     )
-    from torch_tensorrt.dynamo._tracer import trace as dynamo_trace
     from torch_tensorrt.dynamo.utils import get_torch_inputs
 
 logger = logging.getLogger(__name__)
@@ -728,11 +722,26 @@ def save(
 
                 - If both dynamic_shapes and Input objects are provided, the explicit dynamic_shapes
                   parameter takes precedence.
-        kwargs: Additional format-specific kwargs. ``partitioners=`` and
-                ``compile_specs=`` are only used with ``output_format="executorch"``;
-                otherwise they are ignored with a warning. Pass
+        kwargs: Additional format-specific kwargs. ``partitioners=``,
+                ``compile_specs=``, and ``weight_streaming_budget=`` are only used
+                with ``output_format="executorch"``; otherwise they are ignored
+                with a warning. Pass
                 ``compile_specs=[CompileSpec("target_device", b"cuda:<i>")]`` to
                 override the default target device (``cuda:0``).
+                ``weight_streaming_budget`` controls the TensorRT weight streaming
+                budget for the delegate: ``None`` (the default) lets the delegate
+                pick an automatic budget at load, and a non-negative integer sets
+                an explicit GPU budget in bytes. For a program with multiple
+                TensorRT engines an explicit byte budget applies to each engine
+                independently, so prefer ``None`` there. Only an engine built with
+                ``enable_weight_streaming=True`` honors the budget at runtime. This
+                value is an export-time default; it can be overridden at load with a
+                ``weight_streaming_budget`` backend option so the same ``.pte`` can
+                be sized for different GPUs without re-exporting. That load-time
+                override currently works only from runtimes that pass ExecuTorch
+                backend options (the C++ ``Module`` API and iOS); the ExecuTorch
+                Python and Android loaders do not expose it yet and use this
+                export-time value.
     """
     if isinstance(module, CudaGraphsTorchTensorRTModule):
         module = module.compiled_module
@@ -759,6 +768,7 @@ def save(
 
     executorch_partitioners = kwargs.pop("partitioners", None)
     executorch_compile_specs = kwargs.pop("compile_specs", None)
+    executorch_weight_streaming_budget = kwargs.pop("weight_streaming_budget", None)
 
     if output_format not in accepted_formats:
         raise ValueError(
@@ -770,6 +780,16 @@ def save(
             "with executorch.exir. Install with: pip install "
             "\"executorch\" to use output_format='executorch'."
         )
+    # Recognized executorch options are popped above; any other leftover kwarg
+    # for output_format='executorch' is a typo (the executorch path forwards no
+    # other kwargs), so fail loudly instead of silently ignoring it.
+    if output_format == "executorch" and kwargs:
+        raise TypeError(
+            "save() received unexpected keyword argument(s) for "
+            f"output_format='executorch': {sorted(kwargs)}. Supported executorch "
+            "options are 'partitioners', 'compile_specs', and "
+            "'weight_streaming_budget'."
+        )
 
     def _all_are_input_objects(obj: Any) -> bool:
         """Recursively check if all elements in nested collections are Input objects."""
@@ -880,6 +900,16 @@ def _extract_tensor(obj: Any) -> Any:
             "compile_specs= is only used with output_format='executorch' and will "
             f"be ignored for output_format='{output_format}'."
         )
+    if executorch_weight_streaming_budget is not None and output_format != "executorch":
+        logger.warning(
+            "weight_streaming_budget= is only used with output_format='executorch' "
+            f"and will be ignored for output_format='{output_format}'."
+        )
+    if executorch_weight_streaming_budget is not None and output_format == "executorch":
+        # Validate eagerly so an invalid budget (non-int or out-of-range) fails
+        # fast, before any expensive export. It is normalized again later when
+        # building the compile_specs.
+        _normalize_weight_streaming_budget(executorch_weight_streaming_budget)
     if output_format == "aot_inductor" and platform.system() != "Linux":
         raise ValueError(
             f"The AOT Inductor format is only supported on Linux, {platform.system()} is not a supported platform for this format"
@@ -948,6 +978,7 @@ def _extract_tensor(obj: Any) -> Any:
                     file_path,
                     partitioners=executorch_partitioners,
                     compile_specs=executorch_compile_specs,
+                    weight_streaming_budget=executorch_weight_streaming_budget,
                 )
             else:
                 raise RuntimeError(
@@ -1013,6 +1044,7 @@ def _extract_tensor(obj: Any) -> Any:
                         file_path,
                         partitioners=executorch_partitioners,
                         compile_specs=executorch_compile_specs,
+                        weight_streaming_budget=executorch_weight_streaming_budget,
                     )
                 else:
                     raise RuntimeError(
@@ -1099,6 +1131,7 @@ def _extract_tensor(obj: Any) -> Any:
                         file_path,
                         partitioners=executorch_partitioners,
                         compile_specs=executorch_compile_specs,
+                        weight_streaming_budget=executorch_weight_streaming_budget,
                     )
                 else:
                     raise RuntimeError(
@@ -1233,9 +1266,7 @@ def _replace_execute_engine_for_executorch(exp_program: Any) -> Any:
         # Use FX's unique-attr-name helper so re-export passes (which may
         # invoke this rewriter multiple times on the same `gm`) don't
         # silently overwrite earlier engine buffers.
-        from torch.fx.experimental.const_fold import (
-            get_unique_attr_name_in_module,
-        )
+        from torch.fx.experimental.const_fold import get_unique_attr_name_in_module
 
         buffer_name = get_unique_attr_name_in_module(gm, "_trt_engine_0")
         gm.register_buffer(buffer_name, engine_tensor, persistent=True)
@@ -1296,6 +1327,80 @@ def _replace_execute_engine_for_executorch(exp_program: Any) -> Any:
     return exp_program
 
 
+WEIGHT_STREAMING_BUDGET_MAX_BYTES = 2**63
+
+
+def _normalize_weight_streaming_budget(
+    weight_streaming_budget: Any,
+) -> Optional[bytes]:
+    """Validate the budget and encode it as a compile-spec value.
+
+    ``None`` (the default) means automatic: the delegate picks the budget at load.
+    A non-negative integer is an explicit GPU budget in bytes. Returns the ASCII
+    bytes to store in the CompileSpec, or ``None`` when no budget was supplied.
+    """
+    if weight_streaming_budget is None:
+        return None
+    # bool is an int subclass, so reject it explicitly along with non-ints.
+    if isinstance(weight_streaming_budget, bool) or not isinstance(
+        weight_streaming_budget, int
+    ):
+        raise TypeError(
+            "weight_streaming_budget must be a non-negative int (number of bytes) "
+            f"or None for automatic, got {type(weight_streaming_budget).__name__}."
+        )
+    if (
+        weight_streaming_budget < 0
+        or weight_streaming_budget >= WEIGHT_STREAMING_BUDGET_MAX_BYTES
+    ):
+        raise ValueError(
+            "weight_streaming_budget must be in [0, 2**63), got "
+            f"{weight_streaming_budget}."
+        )
+    return str(weight_streaming_budget).encode("ascii")
+
+
+def _resolve_executorch_compile_specs(
+    exp_program: Any,
+    caller_compile_specs: Sequence[Any],
+    weight_streaming_budget: Any,
+) -> List[Any]:
+    """Resolve the compile_specs passed to TensorRTPartitioner.
+
+    Appends the explicit weight streaming budget (from the save() kwarg) as a
+    CompileSpec. When no budget is given nothing is added and the delegate applies
+    TensorRT's automatic budget itself for engines built with weight streaming.
+    Caller-provided compile_specs are forwarded unchanged.
+    """
+    from executorch.exir.backend.compile_spec_schema import CompileSpec
+    from torch_tensorrt.executorch.partitioner import (
+        WEIGHT_STREAMING_BUDGET_COMPILE_SPEC_KEY,
+    )
+
+    specs = list(caller_compile_specs)
+    if any(
+        getattr(spec, "key", None) == WEIGHT_STREAMING_BUDGET_COMPILE_SPEC_KEY
+        for spec in specs
+    ):
+        raise ValueError(
+            f"Do not pass a CompileSpec('{WEIGHT_STREAMING_BUDGET_COMPILE_SPEC_KEY}', "
+            "...) in compile_specs; use the weight_streaming_budget argument of "
+            "save() instead."
+        )
+    spec_value = _normalize_weight_streaming_budget(weight_streaming_budget)
+    if spec_value is None:
+        return specs
+
+    if _count_executorch_engine_nodes(exp_program) > 1:
+        logger.warning(
+            "weight_streaming_budget is an explicit byte budget but the program "
+            "contains multiple TensorRT engines; it is applied to each engine "
+            "independently. Pass None to size each engine's budget automatically."
+        )
+    specs.append(CompileSpec(WEIGHT_STREAMING_BUDGET_COMPILE_SPEC_KEY, spec_value))
+    return specs
+
+
 def _save_as_executorch(exp_program: Any, file_path: str, **kwargs: Any) -> None:
     """Save an ExportedProgram (with TensorRT execute_engine nodes) as an ExecuTorch .pte file.
 
@@ -1318,16 +1423,12 @@ def _save_as_executorch(exp_program: Any, file_path: str, **kwargs: Any) -> None
             "\"executorch\" to use output_format='executorch'."
         )
     import torch_tensorrt.dynamo.runtime.meta_ops.register_meta_ops  # noqa: F401
-    from torch_tensorrt.executorch import (
-        TensorRTPartitioner,
-        get_edge_compile_config,
-    )
+    from torch_tensorrt.executorch import get_edge_compile_config, TensorRTPartitioner
 
     extra_partitioners = kwargs.get("partitioners") or []
     if not isinstance(extra_partitioners, (list, tuple)):
         raise TypeError(
-            "partitioners must be a list or tuple when using "
-            "output_format='executorch'"
+            "partitioners must be a list or tuple when using output_format='executorch'"
         )
     # Forward any caller-provided compile_specs to TensorRTPartitioner so users
     # can override the default target_device ("cuda:0") by passing e.g.
@@ -1339,9 +1440,17 @@ def _save_as_executorch(exp_program: Any, file_path: str, **kwargs: Any) -> None
             "compile_specs must be a list or tuple when using "
             "output_format='executorch'"
         )
-    partitioners = [
-        TensorRTPartitioner(compile_specs=list(executorch_compile_specs))
-    ] + list(extra_partitioners)
+    # Resolve the weight streaming budget into the compile_specs from the save()
+    # kwarg. If none is given nothing is added and the delegate applies TensorRT's
+    # automatic budget itself for engines built with weight streaming.
+    resolved_compile_specs = _resolve_executorch_compile_specs(
+        exp_program,
+        list(executorch_compile_specs),
+        kwargs.get("weight_streaming_budget"),
+    )
+    partitioners = [TensorRTPartitioner(compile_specs=resolved_compile_specs)] + list(
+        extra_partitioners
+    )
 
     engine_count = _count_executorch_engine_nodes(exp_program)
     if engine_count > 1:
@@ -1377,14 +1486,10 @@ def _normalize_engine_constants_to_python(exp_program: "ExportedProgram") -> Non
     import base64
 
     from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ENGINE_IDX
-    from torch_tensorrt.dynamo.runtime._TRTEngine import (
-        EngineSerializer,
-        TRTEngine,
-    )
+    from torch_tensorrt.dynamo.runtime._TRTEngine import EngineSerializer, TRTEngine
 
     for fqn, constant in list(exp_program.constants.items()):
         if isinstance(constant, (torch._C.ScriptObject, TRTEngine)):
-
             state = constant.__getstate__()
             if len(state) == 2 and (
                 state[1] == "TRTEngine"
diff --git a/py/torch_tensorrt/executorch/partitioner.py b/py/torch_tensorrt/executorch/partitioner.py
index c7637a90b7..bba89c5538 100644
--- a/py/torch_tensorrt/executorch/partitioner.py
+++ b/py/torch_tensorrt/executorch/partitioner.py
@@ -15,9 +15,9 @@
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
 from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import DEVICE_IDX
 from torch_tensorrt.executorch.backend import (
-    TensorRTBackend,
     _get_engine_info_from_edge_program,
     _parse_device_id,
+    TensorRTBackend,
 )
 from torch_tensorrt.executorch.operator_support import TensorRTOperatorSupport
 
@@ -35,6 +35,15 @@
 except ImportError:
     _TARGET_DEVICE_COMPILE_SPEC_KEY = "target_device"
 
+# Compile spec key that carries the TensorRT weight streaming budget into the
+# delegate. Must match kWeightStreamingBudgetKey on the C++ side
+# (cpp/include/torch_tensorrt/executorch/WeightStreamingBudget.h). The value is a
+# non-negative decimal integer of bytes (ASCII). The key is absent for the
+# automatic budget, which the delegate applies itself for streamable engines.
+# The delegate also reads this same key as a load-time backend option (runtime
+# spec), which takes precedence over this baked value when provided at load.
+WEIGHT_STREAMING_BUDGET_COMPILE_SPEC_KEY = "weight_streaming_budget"
+
 logger = logging.getLogger(__name__)
 
 
diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD
index aa42a07058..e2af4b8c9c 100644
--- a/tests/cpp/BUILD
+++ b/tests/cpp/BUILD
@@ -66,6 +66,7 @@ test_suite(
     tests = [
         "//tests/cpp/executorch:test_executorch_binding_names",
         "//tests/cpp/executorch:test_executorch_blob_header",
+        "//tests/cpp/executorch:test_executorch_weight_streaming_budget",
     ],
 )
 
diff --git a/tests/cpp/executorch/BUILD b/tests/cpp/executorch/BUILD
index aee13cbcfd..5d1b664659 100644
--- a/tests/cpp/executorch/BUILD
+++ b/tests/cpp/executorch/BUILD
@@ -7,6 +7,7 @@ test_suite(
     tests = [
         ":test_executorch_binding_names",
         ":test_executorch_blob_header",
+        ":test_executorch_weight_streaming_budget",
     ],
 )
 
@@ -27,3 +28,12 @@ cc_test(
         "@googletest//:gtest_main",
     ],
 )
+
+cc_test(
+    name = "test_executorch_weight_streaming_budget",
+    srcs = ["test_executorch_weight_streaming_budget.cpp"],
+    deps = [
+        "//cpp:tensorrt_executorch_weight_streaming_budget",
+        "@googletest//:gtest_main",
+    ],
+)
diff --git a/tests/cpp/executorch/test_executorch_weight_streaming_budget.cpp b/tests/cpp/executorch/test_executorch_weight_streaming_budget.cpp
new file mode 100644
index 0000000000..08edc3ec05
--- /dev/null
+++ b/tests/cpp/executorch/test_executorch_weight_streaming_budget.cpp
@@ -0,0 +1,109 @@
+#include "torch_tensorrt/executorch/WeightStreamingBudget.h"
+
+#include "gtest/gtest.h"
+
+#include <cstddef>
+#include <cstring>
+#include <limits>
+
+namespace torch_tensorrt {
+namespace executorch_backend {
+namespace {
+
+// The compile spec value is a raw byte range; parse a C string literal by length.
+WsBudget parse(const char* s) {
+  return parse_weight_streaming_budget(s, std::strlen(s));
+}
+
+TEST(ExecuTorchWeightStreamingBudget, ParsesZeroBytes) {
+  const WsBudget budget = parse("0");
+  EXPECT_TRUE(budget.valid);
+  EXPECT_EQ(budget.bytes, 0);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, ParsesLargeByteCount) {
+  const WsBudget budget = parse("8589934592");
+  EXPECT_TRUE(budget.valid);
+  EXPECT_EQ(budget.bytes, 8589934592LL);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsNegative) {
+  // The budget is a non-negative byte count; negatives are rejected.
+  EXPECT_FALSE(parse("-1").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, ParsesInt64Max) {
+  const WsBudget budget = parse("9223372036854775807");
+  EXPECT_TRUE(budget.valid);
+  EXPECT_EQ(budget.bytes, std::numeric_limits<int64_t>::max());
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsLargeNegative) {
+  EXPECT_FALSE(parse("-9223372036854775808").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsOverflow) {
+  // One past the int64_t maximum, so from_chars reports result_out_of_range.
+  EXPECT_FALSE(parse("9223372036854775808").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsEmpty) {
+  EXPECT_FALSE(parse("").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsGarbage) {
+  EXPECT_FALSE(parse("garbage").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsTrailingNonNumeric) {
+  EXPECT_FALSE(parse("12x").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsInternalWhitespace) {
+  EXPECT_FALSE(parse("12 34").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsTrailingWhitespace) {
+  EXPECT_FALSE(parse("42 ").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, ParsesNonNulTerminatedBuffer) {
+  // The compile spec value is a raw byte range, not a C string. Place the value
+  // inside a larger buffer with no terminator after it and parse only its bytes.
+  const char raw[] = {'1', '2', '8', 'X', 'Y', 'Z'};
+  const WsBudget budget = parse_weight_streaming_budget(raw, 3);
+  EXPECT_TRUE(budget.valid);
+  EXPECT_EQ(budget.bytes, 128);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, ZeroLengthBufferIsMalformed) {
+  EXPECT_FALSE(parse_weight_streaming_budget("0", 0).valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, ParsesLeadingZeros) {
+  // Leading zeros are accepted as long as the value fits in int64.
+  const WsBudget budget = parse("0000000001");
+  EXPECT_TRUE(budget.valid);
+  EXPECT_EQ(budget.bytes, 1);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsEmbeddedNul) {
+  // The compile spec value is a raw byte range, so an interior NUL must not let
+  // the value parse as just the bytes before it.
+  const char raw[] = {'1', '2', '3', '\0', 'x'};
+  EXPECT_FALSE(parse_weight_streaming_budget(raw, sizeof(raw)).valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsLeadingWhitespace) {
+  // from_chars does not skip leading whitespace, so " 42" is rejected.
+  EXPECT_FALSE(parse(" 42").valid);
+}
+
+TEST(ExecuTorchWeightStreamingBudget, RejectsLeadingPlus) {
+  // from_chars does not accept a leading plus sign.
+  EXPECT_FALSE(parse("+42").valid);
+}
+
+} // namespace
+} // namespace executorch_backend
+} // namespace torch_tensorrt
diff --git a/tests/py/dynamo/executorch/test_weight_streaming_budget.py b/tests/py/dynamo/executorch/test_weight_streaming_budget.py
new file mode 100644
index 0000000000..75123041b8
--- /dev/null
+++ b/tests/py/dynamo/executorch/test_weight_streaming_budget.py
@@ -0,0 +1,206 @@
+"""CPU-only tests for the ExecuTorch weight streaming budget option.
+
+These exercise the export-time plumbing: budget validation and the compile spec
+carried into the delegate. The automatic default is applied by the C++ delegate
+at load time, so it is not covered here.
+"""
+
+import logging
+from types import SimpleNamespace
+
+import pytest
+
+executorch = pytest.importorskip("executorch.exir")
+
+import torch  # noqa: E402
+from torch_tensorrt._compile import (  # noqa: E402
+    _normalize_weight_streaming_budget,
+    _resolve_executorch_compile_specs,
+    save,
+)
+from torch_tensorrt.executorch.partitioner import (  # noqa: E402
+    WEIGHT_STREAMING_BUDGET_COMPILE_SPEC_KEY,
+)
+
+_KEY = WEIGHT_STREAMING_BUDGET_COMPILE_SPEC_KEY
+
+
+def _budget_spec(specs):
+    for spec in specs:
+        if spec.key == _KEY:
+            return spec
+    return None
+
+
+@pytest.fixture
+def patch_engine_count(monkeypatch):
+    """Patch the engine-node count so the resolver runs without a real program."""
+
+    def _apply(count=1):
+        monkeypatch.setattr(
+            "torch_tensorrt._compile._count_executorch_engine_nodes",
+            lambda exp_program: count,
+        )
+
+    return _apply
+
+
+# ---------------------------------------------------------------------------
+# _normalize_weight_streaming_budget
+# ---------------------------------------------------------------------------
+@pytest.mark.unit
+@pytest.mark.parametrize(
+    "value,expected",
+    [
+        (None, None),
+        (0, b"0"),
+        (8589934592, b"8589934592"),
+    ],
+)
+def test_normalize_valid(value, expected):
+    assert _normalize_weight_streaming_budget(value) == expected
+
+
+@pytest.mark.unit
+@pytest.mark.parametrize("value", [-1, -(2**63), 2**63, 2**63 + 5])
+def test_normalize_out_of_range_raises(value):
+    with pytest.raises(ValueError):
+        _normalize_weight_streaming_budget(value)
+
+
+@pytest.mark.unit
+@pytest.mark.parametrize("value", ["auto", "disabled", "1024"])
+def test_normalize_string_raises(value):
+    # Strings are not accepted; the budget is a non-negative int (or None).
+    with pytest.raises(TypeError):
+        _normalize_weight_streaming_budget(value)
+
+
+@pytest.mark.unit
+@pytest.mark.parametrize("value", [True, False])
+def test_normalize_bool_raises(value):
+    with pytest.raises(TypeError):
+        _normalize_weight_streaming_budget(value)
+
+
+@pytest.mark.unit
+def test_normalize_float_raises():
+    with pytest.raises(TypeError):
+        _normalize_weight_streaming_budget(1.5)
+
+
+# ---------------------------------------------------------------------------
+# _resolve_executorch_compile_specs
+# ---------------------------------------------------------------------------
+@pytest.mark.unit
+@pytest.mark.parametrize("budget,expected", [(0, b"0"), (8589934592, b"8589934592")])
+def test_kwarg_injects_compile_spec(patch_engine_count, budget, expected):
+    patch_engine_count(1)
+    specs = _resolve_executorch_compile_specs(SimpleNamespace(), [], budget)
+    spec = _budget_spec(specs)
+    assert spec is not None
+    assert spec.value == expected
+
+
+@pytest.mark.unit
+def test_kwarg_spec_lands_on_delegation_spec(patch_engine_count):
+    from torch_tensorrt.executorch.partitioner import TensorRTPartitioner
+
+    patch_engine_count(1)
+    specs = _resolve_executorch_compile_specs(SimpleNamespace(), [], 8589934592)
+    partitioner = TensorRTPartitioner(compile_specs=specs)
+    spec = _budget_spec(partitioner.delegation_spec.compile_specs)
+    assert spec is not None
+    assert spec.value == b"8589934592"
+
+
+@pytest.mark.unit
+def test_no_spec_injected_without_budget():
+    # No budget: nothing is injected. The delegate applies the automatic budget
+    # itself for streaming-built engines.
+    specs = _resolve_executorch_compile_specs(SimpleNamespace(), [], None)
+    assert _budget_spec(specs) is None
+
+
+@pytest.mark.unit
+def test_caller_compile_specs_passed_through():
+    # Non-budget caller compile_specs are forwarded unchanged.
+    sentinel = SimpleNamespace(key="target_device", value=b"cuda:1")
+    specs = _resolve_executorch_compile_specs(SimpleNamespace(), [sentinel], None)
+    assert sentinel in specs
+    assert _budget_spec(specs) is None
+
+
+@pytest.mark.unit
+def test_caller_budget_spec_in_compile_specs_raises():
+    # The budget must come from the kwarg, not a manually-pinned compile spec.
+    spec = SimpleNamespace(key=_KEY, value=b"4096")
+    with pytest.raises(ValueError):
+        _resolve_executorch_compile_specs(SimpleNamespace(), [spec], None)
+
+
+# ---------------------------------------------------------------------------
+# Multi-engine warning
+# ---------------------------------------------------------------------------
+@pytest.mark.unit
+def test_multi_engine_explicit_warns(patch_engine_count, caplog):
+    patch_engine_count(2)
+    with caplog.at_level(logging.WARNING, logger="torch_tensorrt._compile"):
+        _resolve_executorch_compile_specs(SimpleNamespace(), [], 4096)
+    assert "multiple TensorRT engines" in caplog.text
+
+
+@pytest.mark.unit
+def test_multi_engine_none_does_not_warn(patch_engine_count, caplog):
+    patch_engine_count(2)
+    with caplog.at_level(logging.WARNING, logger="torch_tensorrt._compile"):
+        specs = _resolve_executorch_compile_specs(SimpleNamespace(), [], None)
+    assert _budget_spec(specs) is None
+    assert "multiple TensorRT engines" not in caplog.text
+
+
+# ---------------------------------------------------------------------------
+# save() entry-point guards
+# ---------------------------------------------------------------------------
+@pytest.mark.unit
+def test_save_rejects_bool_budget(tmp_path):
+    with pytest.raises(TypeError):
+        save(
+            torch.nn.Linear(1, 1),
+            str(tmp_path / "model.pte"),
+            output_format="executorch",
+            weight_streaming_budget=True,
+        )
+
+
+@pytest.mark.unit
+def test_save_rejects_string_budget(tmp_path):
+    with pytest.raises(TypeError):
+        save(
+            torch.nn.Linear(1, 1),
+            str(tmp_path / "model.pte"),
+            output_format="executorch",
+            weight_streaming_budget="auto",
+        )
+
+
+@pytest.mark.unit
+def test_save_rejects_negative_budget(tmp_path):
+    with pytest.raises(ValueError):
+        save(
+            torch.nn.Linear(1, 1),
+            str(tmp_path / "model.pte"),
+            output_format="executorch",
+            weight_streaming_budget=-1,
+        )
+
+
+@pytest.mark.unit
+def test_save_rejects_unknown_executorch_kwarg(tmp_path):
+    with pytest.raises(TypeError, match="unexpected keyword argument"):
+        save(
+            torch.nn.Linear(1, 1),
+            str(tmp_path / "model.pte"),
+            output_format="executorch",
+            weight_streaming_budgett=4096,
+        )