@@ -351,6 +351,18 @@ def allocation_scope(current_stage: ExecutorMemoryType,
     validate_feature_combination(llm_args, model_engine, llm_args.sampler_type)
 
     if llm_args.sm_disagg_config is not None:
+        if llm_args.cache_transceiver_config is not None:
+            raise ValueError(
+                "SM-level disaggregation is not compatible with disaggregated serving."
+            )
+        if llm_args.parallel_config.world_size > 1:
+            raise NotImplementedError(
+                "SM-level disaggregation is not supported with parallelism.")
+        if scheduler_config.capacity_scheduler_policy != CapacitySchedulerPolicy.GUARANTEED_NO_EVICT:
+            raise NotImplementedError(
+                "SM-level disaggregation is only supported with guaranteed no evict scheduler policy."
+            )
+
         with allocation_scope(ExecutorMemoryType.MODEL_ENGINE_CTX,
                               RestoreMode.PINNED):
             ctx_llm_args = copy.copy(llm_args)
@@ -367,23 +379,6 @@ def allocation_scope(current_stage: ExecutorMemoryType,
     else:
         ctx_model_engine = None
 
-    if llm_args.sm_disagg_config is not None:
-        with allocation_scope(ExecutorMemoryType.MODEL_ENGINE_CTX,
-                              RestoreMode.PINNED):
-            ctx_backend_config = copy.copy(pytorch_backend_config)
-            ctx_backend_config.use_cuda_graph = False
-            ctx_model_engine = PyTorchModelEngine(
-                model_path=checkpoint_dir,
-                llm_args=llm_args,
-                mapping=mapping,
-                attn_runtime_features=attn_runtime_features,
-                dist=dist,
-                spec_config=spec_config,
-                weight_sharing_model=model_engine.model,
-            )
-    else:
-        ctx_model_engine = None
-
     if has_draft_model_engine:
         with allocation_scope(ExecutorMemoryType.MODEL_ENGINE_DRAFT,
                               RestoreMode.PINNED):