Commit bed20f7

sm disagg implementation
Signed-off-by: Qiang Xu <qiangx@nvidia.com>
1 parent 1a1d617 commit bed20f7

File tree: 11 files changed, +453 −259 lines changed

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 21 additions & 2 deletions
@@ -14,7 +14,7 @@
                                   EagleDecodingConfig, KvCacheConfig,
                                   MTPDecodingConfig, PeftCacheConfig,
                                   SamplerType, SchedulerConfig,
-                                  SparseAttentionConfig,
+                                  SmDisaggConfig, SparseAttentionConfig,
                                   SpeculativeConfig, TorchLlmArgs)
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_helper import (LoraConfig,
@@ -38,7 +38,7 @@
 from .sampler import (EarlyStopSampler, EarlyStopWithMMResult, TorchSampler,
                       TRTLLMSampler)
 from .scheduler import (BindCapacityScheduler, BindMicroBatchScheduler,
-                        SimpleScheduler)
+                        SimpleScheduler, SmDisaggCtxScheduler)
 from .seq_slot_manager import SeqSlotManager
 
 GB = 1 << 30
@@ -665,6 +665,8 @@ def create_py_executor_instance(
         max_batch_size: Optional[int] = None,
         max_beam_width: Optional[int] = None,
         max_num_tokens: Optional[int] = None,
+        ctx_model_engine: Optional[PyTorchModelEngine] = None,
+        sm_disagg_config: Optional[SmDisaggConfig] = None,
         peft_cache_config: Optional[PeftCacheConfig] = None,
         scheduler_config: Optional[SchedulerConfig] = None,
         cache_transceiver_config: Optional[CacheTransceiverConfig] = None,
@@ -789,6 +791,21 @@ def create_py_executor_instance(
                                             ctx_chunk_config)
         scheduler = SimpleScheduler(capacity_scheduler, mb_scheduler)
 
+    if sm_disagg_config is not None:
+        scheduler_capacity += sm_disagg_config.context_max_batch_size * mapping.pp_size
+        capacity_scheduler = BindCapacityScheduler(
+            scheduler_capacity,
+            kv_cache_manager.impl if kv_cache_manager is not None else None,
+            peft_cache_manager.impl if peft_cache_manager is not None else None,
+            scheduler_config.capacity_scheduler_policy,
+            two_step_lookahead=mapping.has_pp())
+        mb_scheduler = BindMicroBatchScheduler(
+            sm_disagg_config.context_max_batch_size,
+            sm_disagg_config.context_max_num_tokens, ctx_chunk_config)
+        ctx_scheduler = SmDisaggCtxScheduler(capacity_scheduler, mb_scheduler)
+    else:
+        ctx_scheduler = None
+
     config = model_engine.model.model_config.pretrained_config
     attention_type = AttentionTypeCpp.MLA if is_mla(
         config) else AttentionTypeCpp.DEFAULT
@@ -801,6 +818,8 @@
         model_engine=model_engine,
         sampler=sampler,
         drafter=drafter,
+        ctx_scheduler=ctx_scheduler,
+        ctx_model_engine=ctx_model_engine,
         dist=dist,
         max_num_sequences=max_num_sequences,
         disable_overlap_scheduler=llm_args.disable_overlap_scheduler,
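
In short: when sm_disagg_config is set, create_py_executor_instance builds a second capacity/micro-batch scheduler pair dedicated to the context phase and passes it to the executor as ctx_scheduler, alongside the optional ctx_model_engine. A minimal sketch of that wiring, factored into a hypothetical helper (the helper name is illustrative; the arguments are the same objects used in the hunk above):

    # Sketch only: the context-scheduler wiring from the hunk above, expressed
    # as a standalone helper. Returns None when SM disaggregation is disabled.
    def build_ctx_scheduler(sm_disagg_config, scheduler_capacity, mapping,
                            kv_cache_manager, peft_cache_manager,
                            scheduler_config, ctx_chunk_config):
        if sm_disagg_config is None:
            return None
        # Reserve extra scheduler slots for context-phase requests on every PP rank.
        scheduler_capacity += sm_disagg_config.context_max_batch_size * mapping.pp_size
        capacity_scheduler = BindCapacityScheduler(
            scheduler_capacity,
            kv_cache_manager.impl if kv_cache_manager is not None else None,
            peft_cache_manager.impl if peft_cache_manager is not None else None,
            scheduler_config.capacity_scheduler_policy,
            two_step_lookahead=mapping.has_pp())
        mb_scheduler = BindMicroBatchScheduler(
            sm_disagg_config.context_max_batch_size,
            sm_disagg_config.context_max_num_tokens, ctx_chunk_config)
        return SmDisaggCtxScheduler(capacity_scheduler, mb_scheduler)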

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 27 additions & 3 deletions
@@ -50,7 +50,8 @@ class ExecutorRequestQueue:
     def __init__(self, dist: Distributed, enable_attention_dp: bool,
                  max_batch_size: int, max_beam_width: int,
                  max_num_active_requests: int, enable_iter_perf_stats: bool,
-                 batch_wait_timeout_ms: float, is_disaggregated: bool):
+                 batch_wait_timeout_ms: float, is_disaggregated: bool,
+                 is_sm_disagg: bool):
         self.dist = dist
         self.request_queue: queue.Queue[RequestQueueItem] = queue.Queue()
         self.waiting_queue: deque[RequestQueueItem] = deque()
@@ -60,6 +61,7 @@ def __init__(self, dist: Distributed, enable_attention_dp: bool,
         self.max_beam_width = max_beam_width
         self.max_num_active_requests = max_num_active_requests
         self.is_disaggregated = is_disaggregated
+        self.is_sm_disagg = is_sm_disagg
         self.enqueue_lock = threading.Lock()
         self.next_request_id = max_batch_size
         self.enable_iter_perf_stats = enable_iter_perf_stats
@@ -333,13 +335,35 @@ def _fetch_and_process_requests(
 
     @nvtx_range("_fetch_new_requests")
     def fetch_new_requests(
-            self, activate_requests: List[LlmRequest]) -> List[LlmRequest]:
+            self, activate_requests: List[LlmRequest],
+            num_active_requests_on_engine: int) -> List[LlmRequest]:
 
-        if self.enable_attention_dp:
+        if self.is_sm_disagg:
+            return self._fetch_new_requests_sm_disagg(
+                len(activate_requests), num_active_requests_on_engine)
+        elif self.enable_attention_dp:
             return self._fetch_new_requests_attention_dp(activate_requests)
         else:
             return self._fetch_new_requests_attention_tp(len(activate_requests))
 
+    def _fetch_new_requests_sm_disagg(
+            self, num_active_requests: int,
+            num_active_requests_on_engine: int) -> List[LlmRequest]:
+        """Handle SM-level disaggregation request fetching."""
+        total_max_num_active_requests = (self.max_num_active_requests +
+                                         num_active_requests -
+                                         num_active_requests_on_engine)
+
+        # fetch and process requests into waiting queue
+        new_requests = self._fetch_and_process_requests(
+            num_active_requests_on_engine,
+            total_max_num_active_requests,
+            enable_attention_dp=False)
+
+        # Merge requests and add to active list
+        merged_requests = self._merge_requests(new_requests)
+        return merged_requests
+
     def _fetch_new_requests_attention_tp(
             self, num_active_requests: int) -> List[LlmRequest]:
         """Handle standard (non-attention DP) request fetching."""

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 8 additions & 0 deletions
@@ -797,3 +797,11 @@ def get_draft_token_length(request: LlmRequest) -> int:
     if request.py_draft_tokens is not None:
         return len(request.py_draft_tokens)
     return 0
+
+
+def get_context_requests(requests: List[LlmRequest]):
+    return [req for req in requests if req.is_context_init_state]
+
+
+def get_generation_requests(requests: List[LlmRequest]):
+    return [req for req in requests if not req.is_context_init_state]
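
These helpers simply partition a request list by phase, which lets an SM-disaggregated executor route context-init requests to the context engine and the rest to the generation engine. A minimal usage sketch (active_requests is an illustrative variable):

    # Usage sketch: split one scheduled batch by phase.
    ctx_requests = get_context_requests(active_requests)
    gen_requests = get_generation_requests(active_requests)
    assert len(ctx_requests) + len(gen_requests) == len(active_requests)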

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 15 additions & 4 deletions
@@ -136,10 +136,12 @@ def __init__(
         attn_runtime_features: Optional[AttentionRuntimeFeatures] = None,
         dist: Optional[MPIDist] = None,
         spec_config: Optional["DecodingBaseConfig"] = None,
+        is_sm_disagg_ctx_phase: bool = False,
         is_draft_model: bool = False,
         drafting_loop_wrapper: Optional[Callable[[torch.nn.Module],
                                                  torch.nn.Module]] = None,
         model: Optional[torch.nn.Module] = None,
+        weight_sharing_model: Optional[torch.nn.Module] = None,
     ):
         self.forward_pass_callable = None
         self.ub_buffers = None
@@ -149,6 +151,9 @@ def __init__(
             max_seq_len,
             max_batch_size,
         ) = llm_args.get_runtime_sizes()
+        if is_sm_disagg_ctx_phase:
+            max_num_tokens = llm_args.sm_disagg_config.context_max_num_tokens
+            max_batch_size = llm_args.sm_disagg_config.context_max_batch_size
 
         self.batch_size = max_batch_size
         self.max_num_tokens = max_num_tokens
@@ -166,6 +171,7 @@ def __init__(
         if dist is not None:
             ExpertStatistic.create(self.dist.rank)
         self.llm_args = llm_args
+        self.sm_disagg_enabled = llm_args.sm_disagg_config is not None
         self.original_max_draft_len = spec_config.max_draft_len if spec_config is not None else 0
         self.original_max_total_draft_tokens = spec_config.max_total_draft_tokens if spec_config is not None else 0
 
@@ -196,6 +202,7 @@ def __init__(
             max_num_tokens=self.max_num_tokens,
             max_seq_len=self.max_seq_len,
             lora_config=lora_config,
+            weight_sharing_model=weight_sharing_model,
         )
         self.model, moe_load_balancer = loader.load(
             checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader)
@@ -1352,8 +1359,10 @@ def _prepare_tp_inputs(
             # the request has no previous tensor:
             # (1) next_draft_tokens_device is None, which means overlap scheduler is disabled; or
             # (2) a dummy request; or
-            # (3) the first step in the generation server of disaggregated serving
-            if next_draft_tokens_device is None or request.is_dummy or request.py_batch_idx is None:
+            # (3) the first step in the generation server of disaggregated serving; or
+            # (4) the first step in the generation phase of SM-level disaggregation
+            if next_draft_tokens_device is None or request.is_dummy or request.py_batch_idx is None \
+                    or self.sm_disagg_enabled and request.max_num_generated_tokens == 0:
                 # get token ids, including input token ids and draft token ids. For these dummy requests,
                 # no need to copy the token ids.
                 if not (request.is_attention_dp_dummy
@@ -1456,8 +1465,10 @@ def _prepare_tp_inputs(
             # the request has no previous tensor:
             # (1) new_tokens_device is None, which means overlap scheduler is disabled; or
             # (2) a dummy request; or
-            # (3) the first step in the generation server of disaggregated serving
-            if new_tokens_device is None or request.is_dummy or request.py_batch_idx is None:
+            # (3) the first step in the generation server of disaggregated serving; or
+            # (4) the first step in the generation phase of SM-level disaggregation
+            if new_tokens_device is None or request.is_dummy or request.py_batch_idx is None \
+                    or self.sm_disagg_enabled and request.max_num_generated_tokens == 0:
                 # skip adding input_ids of CUDA graph dummy requests so that new_tokens_device
                 # can be aligned to the correct positions.
                 if not request.is_cuda_graph_dummy:
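
Taken together, a context-phase engine is a second PyTorchModelEngine that (a) overrides its runtime sizes with the context_max_* values from sm_disagg_config and (b) borrows the generation engine's weights instead of loading its own copy. A rough sketch of how such a pair might be constructed; the two new keyword names come from this diff, while the call site and the remaining constructor arguments are elided:

    # Sketch only: pairing a context-phase engine with a generation engine.
    # Other required constructor arguments are omitted for brevity.
    gen_engine = PyTorchModelEngine(model_path=model_path, llm_args=llm_args)
    ctx_engine = PyTorchModelEngine(
        model_path=model_path,
        llm_args=llm_args,
        is_sm_disagg_ctx_phase=True,            # use context_max_* runtime sizes
        weight_sharing_model=gen_engine.model,  # reuse the already-loaded weights
    )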

tensorrt_llm/_torch/pyexecutor/model_loader.py

Lines changed: 9 additions & 1 deletion
@@ -191,7 +191,8 @@ def __init__(self,
                  sparse_attention_config: Optional["SparseAttentionConfig"],
                  max_num_tokens: int,
                  max_seq_len: Optional[int],
-                 lora_config: Optional[LoraConfig] = None):
+                 lora_config: Optional[LoraConfig] = None,
+                 weight_sharing_model: Optional[torch.nn.Module] = None):
         """
         Initializes the ModelLoader.
 
@@ -210,6 +211,7 @@ def __init__(self,
         self.max_num_tokens = max_num_tokens
         self.max_seq_len = max_seq_len
         self.lora_config = lora_config
+        self.weight_sharing_model = weight_sharing_model
 
     def load(
         self,
@@ -307,6 +309,12 @@ def init_meta_tensor(t: torch.Tensor):
             moe_load_balancer.finalize_model()
             logger.info("moe_load_balancer finalize model done")
 
+        if self.weight_sharing_model is not None:
+            model.load_state_dict(self.weight_sharing_model.state_dict(),
+                                  assign=True)
+            # Free up duplicate model weights allocated before weight sharing
+            torch.cuda.empty_cache()
+
         torch.cuda.current_stream().synchronize()
 
         return model, moe_load_balancer
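
The sharing mechanism is load_state_dict(..., assign=True) (available in PyTorch 2.1+): instead of copying the source tensors into the freshly allocated parameters, the destination module's parameters are rebound to the source storage, and torch.cuda.empty_cache() then releases the now-unreferenced allocations. A tiny standalone illustration of that behavior:

    import torch

    # assign=True rebinds dst's parameters to the tensors from src's state dict,
    # so both modules end up referencing the same underlying storage.
    src = torch.nn.Linear(4, 4)
    dst = torch.nn.Linear(4, 4)
    dst.load_state_dict(src.state_dict(), assign=True)
    assert dst.weight.data_ptr() == src.weight.data_ptr()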
