Commit 6e83433

fix reviewer comments
Signed-off-by: Qiang Xu <qiangx@nvidia.com>
1 parent: b24849a

6 files changed: +117 additions, −96 deletions

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 6 additions & 4 deletions
@@ -29,7 +29,7 @@
 from .guided_decoder import GuidedDecoder
 from .kv_cache_connector import KvCacheConnectorManager
 from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver
-from .llm_request import ExecutorResponse
+from .llm_request import ExecutorResponse, LlmRequestState
 from .mamba_cache_manager import MambaHybridCacheManager
 from .model_engine import PyTorchModelEngine
 from .py_executor import PyExecutor
@@ -38,7 +38,7 @@
 from .sampler import (EarlyStopSampler, EarlyStopWithMMResult, TorchSampler,
                       TRTLLMSampler)
 from .scheduler import (BindCapacityScheduler, BindMicroBatchScheduler,
-                        SimpleScheduler, SmDisaggCtxScheduler)
+                        SimpleScheduler)
 from .seq_slot_manager import SeqSlotManager

 GB = 1 << 30
@@ -801,8 +801,10 @@ def create_py_executor_instance(
             two_step_lookahead=mapping.has_pp())
         mb_scheduler = BindMicroBatchScheduler(
             sm_disagg_config.context_max_batch_size,
-            sm_disagg_config.context_max_num_tokens, ctx_chunk_config)
-        ctx_scheduler = SmDisaggCtxScheduler(capacity_scheduler, mb_scheduler)
+            sm_disagg_config.context_max_num_tokens,
+            ctx_chunk_config,
+            no_schedule_after_state=LlmRequestState.GENERATION_IN_PROGRESS)
+        ctx_scheduler = SimpleScheduler(capacity_scheduler, mb_scheduler)
     else:
         ctx_scheduler = None
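The net effect of this hunk is that the context-phase scheduler is now an ordinary SimpleScheduler whose micro-batch scheduler simply stops scheduling a request once it reaches GENERATION_IN_PROGRESS. A minimal sketch of that state-window idea, using plain-Python stand-ins rather than the real LlmRequestState binding (the enum values and the exact boundary semantics are assumptions for illustration):

from enum import IntEnum


class State(IntEnum):  # illustrative stand-in for LlmRequestState; values are made up
    CONTEXT_INIT = 0
    GENERATION_IN_PROGRESS = 1
    GENERATION_COMPLETE = 2


def schedulable(state: State,
                no_schedule_until_state: State = State.CONTEXT_INIT,
                no_schedule_after_state: State = State.GENERATION_COMPLETE) -> bool:
    # A request is eligible once it has reached the "until" state and
    # before it reaches the "after" state.
    return no_schedule_until_state <= state < no_schedule_after_state


# Default window: context and generation requests are both eligible.
assert schedulable(State.GENERATION_IN_PROGRESS)
# Context-only window (as configured above): generation-phase requests are skipped.
assert not schedulable(State.GENERATION_IN_PROGRESS,
                       no_schedule_after_state=State.GENERATION_IN_PROGRESS)
assert schedulable(State.CONTEXT_INIT,
                   no_schedule_after_state=State.GENERATION_IN_PROGRESS)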

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 4 additions & 23 deletions
@@ -342,31 +342,12 @@ def fetch_new_requests(
             self, activate_requests: List[LlmRequest],
             num_active_requests_on_engine: int) -> List[LlmRequest]:

-        if self.is_sm_disagg:
-            return self._fetch_new_requests_sm_disagg(
-                len(activate_requests), num_active_requests_on_engine)
-        elif self.enable_attention_dp:
+        if self.enable_attention_dp:
             return self._fetch_new_requests_attention_dp(activate_requests)
         else:
-            return self._fetch_new_requests_attention_tp(len(activate_requests))
-
-    def _fetch_new_requests_sm_disagg(
-            self, num_active_requests: int,
-            num_active_requests_on_engine: int) -> List[LlmRequest]:
-        """Handle SM-level disaggregation request fetching."""
-        total_max_num_active_requests = (self.max_num_active_requests +
-                                         num_active_requests -
-                                         num_active_requests_on_engine)
-
-        # fetch and process requests into waiting queue
-        new_requests = self._fetch_and_process_requests(
-            num_active_requests_on_engine,
-            total_max_num_active_requests,
-            enable_attention_dp=False)
-
-        # Merge requests and add to active list
-        merged_requests = self._merge_requests(new_requests)
-        return merged_requests
+            num_active_requests = num_active_requests_on_engine if self.is_sm_disagg else len(
+                activate_requests)
+            return self._fetch_new_requests_attention_tp(num_active_requests)

     def _fetch_new_requests_attention_tp(
             self, num_active_requests: int) -> List[LlmRequest]:
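With the dedicated SM-disagg fetch path gone, the only mode-specific behavior left here is which active-request count is handed to the shared attention-TP path. A toy illustration of that selection (a standalone function, not the real ExecutorRequestQueue method):

from typing import List


def pick_num_active_requests(is_sm_disagg: bool, activate_requests: List[str],
                             num_active_requests_on_engine: int) -> int:
    # In SM-disaggregated mode, the engine-wide active count (covering both the
    # context and generation loops) is used; otherwise only this loop's own
    # active list length matters.
    return num_active_requests_on_engine if is_sm_disagg else len(activate_requests)


assert pick_num_active_requests(True, ["req-1", "req-2"], 5) == 5
assert pick_num_active_requests(False, ["req-1", "req-2"], 5) == 2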
tensorrt_llm/_torch/pyexecutor/green_ctx.py (new file)

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+from cuda.bindings import driver
+
+from tensorrt_llm.runtime.generation import CUASSERT
+
+
+def green_ctx_create_streams(res_list, device):
+    streams = []
+    for res in res_list:
+        desc = CUASSERT(driver.cuDevResourceGenerateDesc([res], 1))[0]
+        green_ctx = CUASSERT(
+            driver.cuGreenCtxCreate(
+                desc, device, driver.CUgreenCtxCreate_flags.CU_GREEN_CTX_DEFAULT_STREAM
+            )
+        )[0]
+        stream = CUASSERT(
+            driver.cuGreenCtxStreamCreate(
+                green_ctx, driver.CUstream_flags.CU_STREAM_NON_BLOCKING, 0
+            )
+        )[0]
+        stream = torch.cuda.get_stream_from_external(stream, device)
+        streams.append(stream)
+    return streams
+
+
+def green_ctx_split_percent(sm_percent: float, device_id: int = 0):
+    device = CUASSERT(driver.cuDeviceGet(device_id))[0]
+
+    res = CUASSERT(
+        driver.cuDeviceGetDevResource(device, driver.CUdevResourceType.CU_DEV_RESOURCE_TYPE_SM)
+    )[0]
+    sm_count = res.sm.smCount
+
+    major = CUASSERT(
+        driver.cuDeviceGetAttribute(
+            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device
+        )
+    )[0]
+    if major >= 9:
+        sm_min = 8
+        sm_align = 8
+    else:
+        sm_min = 4 if major == 8 else 2
+        sm_align = 2
+
+    def green_ctx_split_aligned(sm_g1):
+        sm_g1 = round(sm_g1 / sm_align) * sm_align
+        sm_g1 = min(max(sm_g1, sm_min), sm_count - sm_min)
+        result = CUASSERT(
+            driver.cuDevSmResourceSplitByCount(
+                1,  # nbGroups
+                res,
+                0,  # useFlags
+                sm_g1,
+            )
+        )
+        res_split = (result[0][0], result[2])
+        streams = green_ctx_create_streams(res_split, device)
+        return streams, res_split
+
+    sm_g1 = round(sm_count * sm_percent)
+    sm_g2 = sm_count - sm_g1
+    # Choose the split closer to sm_percent when sm_count is not divisible by sm_align
+    sm_g1_dist = min(sm_g1 % sm_align, sm_align - (sm_g1 % sm_align))
+    sm_g2_dist = min(sm_g2 % sm_align, sm_align - (sm_g2 % sm_align))
+    if sm_g1_dist <= sm_g2_dist:
+        (stream_g1, stream_g2), (res_g1, res_g2) = green_ctx_split_aligned(sm_g1)
+    else:
+        (stream_g2, stream_g1), (res_g2, res_g1) = green_ctx_split_aligned(sm_g2)
+    return (stream_g1, stream_g2), (res_g1, res_g2)
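A hypothetical usage sketch of the new helper. It assumes a GPU and driver with green-context support, the cuda-python bindings installed, and that the module lives at tensorrt_llm/_torch/pyexecutor/green_ctx.py (inferred from the relative import added to py_executor.py below):

import torch

from tensorrt_llm._torch.pyexecutor.green_ctx import green_ctx_split_percent

torch.cuda.init()  # make sure the CUDA driver is initialized before the raw driver calls

# Reserve roughly 30% of the device's SMs for the context phase; the helper rounds
# to the architecture's SM alignment and returns one stream per partition.
(stream_ctx, stream_gen), (res_ctx, res_gen) = green_ctx_split_percent(0.3, device_id=0)
print(f"context SMs: {res_ctx.sm.smCount}, generation SMs: {res_gen.sm.smCount}")

x = torch.randn(1024, 1024, device="cuda")
with torch.cuda.stream(stream_ctx):
    y = x @ x      # confined to the SM partition reserved for the context phase
with torch.cuda.stream(stream_gen):
    z = x + 1.0    # confined to the partition reserved for the generation phase
torch.cuda.synchronize()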

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 13 additions & 40 deletions
@@ -42,6 +42,7 @@
 from ..speculative.mtp import SampleStateTensorsMTP
 from ..speculative.speculation_gate import SpeculationGate
 from .executor_request_queue import ExecutorRequestQueue, RequestQueueItem
+from .green_ctx import green_ctx_split_percent
 from .guided_decoder import GuidedDecoder
 from .handle_additional_outputs import HandleAdditionalOutputs
 from .handle_logits import HandleLogits
@@ -215,9 +216,10 @@ def __init__(self,
         self.responses = {}
         self.result_wait_queues = {}

-        self.sm_disagg_lock = threading.Lock()
-        self.ctx_request_cv = threading.Condition(self.sm_disagg_lock)
-        self.gen_request_cv = threading.Condition(self.sm_disagg_lock)
+        if self.ctx_model_engine is not None:
+            self.sm_disagg_lock = threading.Lock()
+            self.ctx_request_cv = threading.Condition(self.sm_disagg_lock)
+            self.gen_request_cv = threading.Condition(self.sm_disagg_lock)

         # kv cache events
         self.kv_cache_manager = self.resource_manager.resource_managers.get(
@@ -229,6 +231,9 @@ def __init__(self,
         self.max_input_len = max_input_len
         # _executor_loop private data
         self.max_num_active_requests = model_engine.get_max_num_sequences()
+        if self.ctx_model_engine is not None:
+            self.max_num_active_requests += ctx_model_engine.get_max_num_sequences(
+            )
         self.active_requests: List[LlmRequest] = []
         self.expected_num_active_requests = 0
         self.ctx_in_transmission_requests = dict()
@@ -1694,7 +1699,11 @@ def _executor_loop_sm_disagg_gen_overlap(self, stream):
                              iter_stats=iter_stats)

     def _executor_loop_sm_disagg(self):
-        stream_ctx, stream_gen = self.split_device_green_ctx()
+        (stream_ctx, stream_gen), (res_ctx, res_gen) = green_ctx_split_percent(
+            self.sm_disagg_ctx_sm_percent, self.device_id)
+        logger.info(
+            f"Green contexts allocated {res_ctx.sm.smCount} SMs for context phase and {res_gen.sm.smCount} SMs for generation phase."
+        )

         thread_ctx = threading.Thread(target=self._executor_loop_sm_disagg_ctx,
                                       args=(stream_ctx, ),
@@ -1705,42 +1714,6 @@ def _executor_loop_sm_disagg(self):

         thread_ctx.join()

-    def split_device_green_ctx(self):
-        device = torch.device("cuda", self.device_id)
-        device_properties = torch.cuda.get_device_properties(device)
-        sm_count = device_properties.multi_processor_count
-        if device_properties.major >= 9:
-            sm_min = 8
-            sm_align = 8
-        else:
-            sm_min = 4 if device_properties.major == 8 else 2
-            sm_align = 2
-
-        from flashinfer import green_ctx
-
-        def split_device_green_ctx_aligned(sm_s1):
-            sm_s1 = round(sm_s1 / sm_align) * sm_align
-            sm_s1 = min(max(sm_s1, sm_min), sm_count - sm_min)
-            return green_ctx.split_device_green_ctx_by_sm_count(device, [sm_s1])
-
-        sm_ctx = round(sm_count * self.sm_disagg_ctx_sm_percent)
-        sm_gen = sm_count - sm_ctx
-        # Choose the split closer to user-specified percentage when sm_count is not divisible by sm_align
-        sm_ctx_dist = min(sm_ctx % sm_align, sm_align - (sm_ctx % sm_align))
-        sm_gen_dist = min(sm_gen % sm_align, sm_align - (sm_gen % sm_align))
-        if sm_gen_dist < sm_ctx_dist:
-            (stream_gen,
-             stream_ctx), (res_gen,
-                           res_ctx) = split_device_green_ctx_aligned(sm_gen)
-        else:
-            (stream_ctx,
-             stream_gen), (res_ctx,
-                           res_gen) = split_device_green_ctx_aligned(sm_ctx)
-        logger.info(
-            f"Green contexts allocated {res_ctx.sm.smCount} SMs for context phase and {res_gen.sm.smCount} SMs for generation phase."
-        )
-        return stream_ctx, stream_gen
-
     def _accept_draft_tokens(
             self, scheduled_batch: ScheduledRequests,
             target_outputs: SampleStateTensors,
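A worked example of the splitting policy implemented both by the removed split_device_green_ctx method and by the new green_ctx_split_percent helper. The numbers assume a 132-SM Hopper GPU (alignment 8, minimum 8) and a 30% context share; the snippet below just replays the helper's rounding rules in plain Python:

sm_count, sm_align, sm_min, sm_percent = 132, 8, 8, 0.3

sm_g1 = round(sm_count * sm_percent)            # 40 SMs requested for the context phase
sm_g2 = sm_count - sm_g1                        # 92 SMs left for the generation phase
sm_g1_dist = min(sm_g1 % sm_align, sm_align - (sm_g1 % sm_align))   # 0: already aligned
sm_g2_dist = min(sm_g2 % sm_align, sm_align - (sm_g2 % sm_align))   # 4: would round to 88 or 96
# The helper splits on whichever side sits closer to an aligned count, here the context side.
assert (sm_g1_dist, sm_g2_dist) == (0, 4)
sm_g1 = min(max(round(sm_g1 / sm_align) * sm_align, sm_min), sm_count - sm_min)
assert sm_g1 == 40        # context partition: 40 SMs; the generation partition gets the rest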

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 1 addition & 0 deletions
@@ -374,6 +374,7 @@ def allocation_scope(current_stage: ExecutorMemoryType,
                 attn_runtime_features=attn_runtime_features,
                 dist=dist,
                 spec_config=spec_config,
+                is_sm_disagg_ctx_phase=True,
                 weight_sharing_model=model_engine.model,
             )
         else:

tensorrt_llm/_torch/pyexecutor/scheduler.py

Lines changed: 11 additions & 29 deletions
@@ -7,7 +7,7 @@
 from tensorrt_llm.bindings import internal as tb_internal
 from tensorrt_llm.llmapi.llm_args import CapacitySchedulerPolicy

-from .llm_request import LlmRequest, LlmRequestState, get_context_requests
+from .llm_request import LlmRequest, LlmRequestState

 RequestList = list[LlmRequest]

@@ -79,6 +79,9 @@ def __init__(
         scheduler_policy: CapacitySchedulerPolicy = CapacitySchedulerPolicy.
         GUARANTEED_NO_EVICT,
         two_step_lookahead: bool = False,
+        no_schedule_until_state: LlmRequestState = LlmRequestState.CONTEXT_INIT,
+        no_schedule_after_state: LlmRequestState = LlmRequestState.
+        GENERATION_COMPLETE,
     ):
         super(BindCapacityScheduler, self).__init__()
         self.kv_cache_manager = kv_cache_manager
@@ -89,8 +92,8 @@ def __init__(
             capacity_scheduler_policy=scheduler_policy._to_pybind(),
             has_kv_cache_manager=kv_cache_manager is not None,
             two_step_lookahead=two_step_lookahead,
-            no_schedule_until_state=LlmRequestState.CONTEXT_INIT,
-            no_schedule_after_state=LlmRequestState.GENERATION_COMPLETE)
+            no_schedule_until_state=no_schedule_until_state,
+            no_schedule_after_state=no_schedule_after_state)

     def schedule_request(
         self, active_requests: RequestList
@@ -175,6 +178,9 @@ def __init__(
         max_batch_size: int,
         max_num_tokens: int = None,
         ctx_chunk_config: Optional[Tuple[StrEnum, int]] = None,
+        no_schedule_until_state: LlmRequestState = LlmRequestState.CONTEXT_INIT,
+        no_schedule_after_state: LlmRequestState = LlmRequestState.
+        GENERATION_COMPLETE,
     ) -> None:
         super(BindMicroBatchScheduler, self).__init__()
         self.max_batch_size = max_batch_size
@@ -186,7 +192,8 @@ def __init__(
             ctx_chunk_config[0]._to_pybind(), ctx_chunk_config[1])

         self.impl = tb_internal.algorithms.MicroBatchScheduler(
-            ctx_chunk_config_cpp, max_num_tokens)
+            ctx_chunk_config_cpp, max_num_tokens, no_schedule_until_state,
+            no_schedule_after_state)

     def schedule(
         self, active_requests: RequestList, inflight_request_ids: set[int]
@@ -216,28 +223,3 @@ def schedule_request(self, active_requests: RequestList,
                                list(generation_requests), list(paused_requests),
                                list(fitting_disagg_gen_init_requests),
                                len(fitting_requests))
-
-
-class SmDisaggCtxScheduler(RequestScheduler):
-
-    def __init__(self, capacity_scheduler: CapacityScheduler,
-                 micro_batch_scheduler: MicroBatchScheduler):
-        super(SmDisaggCtxScheduler, self).__init__()
-        self.capacity_scheduler = capacity_scheduler
-        self.micro_batch_scheduler = micro_batch_scheduler
-
-    def schedule_request(self, active_requests: RequestList,
-                         inflight_request_ids: set[int]) -> SchedulerOutput:
-        fitting_requests, fitting_disagg_gen_init_requests, paused_requests = self.capacity_scheduler.schedule_request(
-            active_requests)
-
-        fitting_requests = get_context_requests(fitting_requests)
-
-        context_requests, generation_requests = self.micro_batch_scheduler.schedule(
-            fitting_requests, inflight_request_ids)
-        # Convert from binding type RequestVector to list[LlmRequest],
-        # so Python fields on LlmRequest won't be stripped away
-        return SchedulerOutput(list(context_requests),
-                               list(generation_requests), list(paused_requests),
-                               list(fitting_disagg_gen_init_requests),
-                               len(fitting_requests))
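With these parameters exposed, a context-only scheduler no longer needs its own class: it is the existing capacity/micro-batch pair with a narrower scheduling window. A sketch of that wiring, mirroring the _util.py call site above (build_ctx_only_scheduler is a hypothetical helper; the capacity scheduler and the batch/token limits come from the caller):

from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState
from tensorrt_llm._torch.pyexecutor.scheduler import (BindMicroBatchScheduler,
                                                      SimpleScheduler)


def build_ctx_only_scheduler(capacity_scheduler, context_max_batch_size: int,
                             context_max_num_tokens: int, ctx_chunk_config=None):
    # Stop scheduling a request here as soon as it enters generation, so only
    # context (prefill) work is ever batched by this scheduler.
    mb_scheduler = BindMicroBatchScheduler(
        context_max_batch_size,
        context_max_num_tokens,
        ctx_chunk_config,
        no_schedule_after_state=LlmRequestState.GENERATION_IN_PROGRESS)
    return SimpleScheduler(capacity_scheduler, mb_scheduler)

The GENERATION_IN_PROGRESS cutoff pushes the context-only filtering down into the MicroBatchScheduler binding, which is what made the removed SmDisaggCtxScheduler and its get_context_requests call redundant.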
