
Commit 9627d74

Tabrizian authored and codego7250 committed
[None][feat] Add support for KVCache reuse for DSv32 (NVIDIA#9383)
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
1 parent a4b2e4f commit 9627d74

7 files changed: +14 −38 lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 4 additions & 0 deletions
@@ -183,6 +183,10 @@ class BlockRange
             auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
             mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
         }
+        if (cacheManager.isEnableIndexerKCache())
+        {
+            mIndexerKCachePool = cacheManager.getIndexerKCachePool();
+        }
     }
 
     BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId)

cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 1 addition & 1 deletion
@@ -806,7 +806,7 @@ class CacheReceiver::Impl
 
         RequestInfo requestInfo(requestId, mSelfState);
 
-        if (mFormatter->getCacheManager()->getBlockManager().getNumPools() == 1)
+        if (!mFormatter->getCacheManager()->getBlockManager().isVariableWindow())
         {
             auto* cacheManager = mFormatter->getCacheManager();
             auto beam = 0;

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 1 addition & 8 deletions
@@ -876,14 +876,7 @@ void WindowBlockManager::allocatePools(bool useUvm)
     }
 
     nvinfer1::Dims cacheShape;
-    if (pool.containsIndexerKCache)
-    {
-        cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, blockSize});
-    }
-    else
-    {
-        cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});
-    }
+    cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});
 
     TLLM_LOG_DEBUG("[%s] Allocating primary pool with %d blocks for %d layers with %d kv heads", mLogPrefix.c_str(),
         mNumPrimaryBlocks, pool.numLayers, pool.numKvHeads);

examples/models/core/deepseek_v3/README.md

Lines changed: 0 additions & 9 deletions
@@ -881,12 +881,3 @@ python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --enable_chunked_pref
 - **GPU Memory:** Adjust `--max_batch_size` and `--max_num_tokens` if you encounter out-of-memory errors.
 - **Logs:** Check `/workspace/trt_bench.log` for detailed performance information and troubleshooting messages.
 - **Configuration Files:** Verify that the configuration files are correctly formatted to avoid runtime issues.
-
-## Known Issues
-- Support for KV Cache Reuse and Chunked Prefill in DeepSeek-V3.2-Exp is currently under development. When running `quickstart_advanced.py`, please include `--disable_kv_cache_reuse` to disable KV Cache Reuse. When using `trtllm-eval`/`trtllm-serve`/`trtllm-bench`, please include the following configuration in the extra llm_api options:
-```
-kv_cache_config:
-  enable_block_reuse: false
-  tokens_per_block: 64
-enable_chunked_prefill: false
-```
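
With the Known Issues workaround above removed, KV cache block reuse no longer needs to be disabled for DeepSeek-V3.2. A minimal, hedged LLM API sketch of the resulting setup (the model path is a placeholder; the KvCacheConfig values simply mirror the updated tests in this commit):

```python
# Sketch only: DeepSeek-V3.2 with KV cache block reuse left at its default (enabled).
# "<YOUR_MODEL_DIR>" is a placeholder; free_gpu_memory_fraction and tokens_per_block
# follow the values used in the updated accuracy tests.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

kv_cache_config = KvCacheConfig(
    free_gpu_memory_fraction=0.7,
    tokens_per_block=64,  # fixed block size noted in dsa.py for the indexer K-cache kernels
)

llm = LLM(model="<YOUR_MODEL_DIR>", kv_cache_config=kv_cache_config)
print(llm.generate("Hello, my name is").outputs[0].text)
```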

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 4 additions & 8 deletions
@@ -930,15 +930,15 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):
             start_idx=0,
         )
 
-        if len(chunk_groups) > 1:
+        if len(chunk_groups
+               ) > 1 or metadata.enable_context_mla_with_cached_kv:
             metadata.indexer_prefill_chunks = [
                 Indexer.prepare_one_prefill_chunk(
                     metadata,
                     chunk_specs,
                 ) for chunk_specs in chunk_groups
             ]
         else:
-            # Single chunk - use non-chunked fallback path
             metadata.indexer_prefill_chunks = None
 
         host_cu_seqlen_ks, host_cu_seqlen_ke = compute_cu_seqlen_kv_bounds_with_cache(
@@ -1018,9 +1018,9 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):
         metadata.slot_mapping_scale[:total_tokens].copy_(
             metadata.host_slot_mapping_scale[:total_tokens], non_blocking=True)
 
-        # Only when MLA chunked prefill is enabled, we need to gather the full KV for indexer's logit computation.
+        # When chunked prefill or KVCache reuse is enabled, we need to gather the full KV for indexer's logit computation.
         # Indexer's own chunking does not need full KV gathering, instead it gathers only the current chunk with loop-based gathering.
-        _need_full_kv_gathering = num_contexts > 0 and has_mla_chunked_prefill
+        _need_full_kv_gathering = num_contexts > 0 and metadata.enable_context_mla_with_cached_kv
         if _need_full_kv_gathering:
             total_kv_len = metadata.host_ctx_kv_indptr[num_contexts].item()
             total_kv_per_request = seq_lens[:
@@ -1589,10 +1589,6 @@ def __init__(
         sparse_attn_config: "SparseAttentionConfig",
         **kwargs,
     ) -> None:
-
-        if kv_cache_config.enable_block_reuse:
-            raise NotImplementedError(
-                "DSA indexer K-cache manager does not support block reuse yet")
         self.quant_block_size = 128
         self.index_head_dim = sparse_attn_config.index_head_dim
         # Use a fixed tokens_per_block for indexer k cache due to DG kernel constraints
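
The net effect of the `prepare` changes above is a wider gate: the indexer now builds per-chunk prefill metadata, and gathers the full KV, not only for multi-chunk prefills but whenever cached (reused) context KV is present. A small illustrative sketch of that predicate; the helper is hypothetical and only the names `chunk_groups` and `enable_context_mla_with_cached_kv` come from the diff:

```python
# Hypothetical helper illustrating the gating change in prepare(); not part of the source tree.
from typing import Sequence


def needs_chunked_indexer_prep(chunk_groups: Sequence,
                               enable_context_mla_with_cached_kv: bool) -> bool:
    # Before this commit: only prefills split into multiple chunks took the chunked path.
    # After: requests with cached (reused) KV take it too, so the indexer sees the
    # full key history when computing its logits.
    return len(chunk_groups) > 1 or enable_context_mla_with_cached_kv


# A single-chunk prefill with KV cache reuse now prepares indexer prefill chunks.
assert needs_chunked_indexer_prep([["chunk0"]], True)
assert not needs_chunked_indexer_prep([["chunk0"]], False)
```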

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 0 additions & 2 deletions
@@ -1055,7 +1055,6 @@ def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}
         gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}
         ctx_server_config["kv_cache_config"] = {
-            "enable_block_reuse": False,
             "free_gpu_memory_fraction": 0.7,
             "tokens_per_block": 64,
             "dtype": "fp8"
@@ -1072,7 +1071,6 @@ def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config["enable_attention_dp"] = True
         ctx_server_config["enable_autotuner"] = False
         gen_server_config["kv_cache_config"] = {
-            "enable_block_reuse": False,
             "tokens_per_block": 64,
             "free_gpu_memory_fraction": 0.7,
             "dtype": "fp8"

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 4 additions & 10 deletions
@@ -2597,17 +2597,13 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         if get_sm_version() == 100 or get_sm_version() == 103:
             moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend
             moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-            # TODO: Support block reuse for DeepSeek-V3.2
-            kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                            free_gpu_memory_fraction=0.6,
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                             tokens_per_block=64)
         else:
             if moe_backend != "_DEFAULT":
                 pytest.skip("Not supported MoE backend!")
             moe_config = MoeConfig()
-            # TODO: Support block reuse for DeepSeek-V3.2
-            kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                            free_gpu_memory_fraction=0.7,
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                             tokens_per_block=64)
 
         pytorch_config = dict(
@@ -2670,8 +2666,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                 "MOE TRTLLM backend does not support SM version 120 or 121")
 
         moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.7,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         tokens_per_block=64)
         cuda_graph_config = CudaGraphConfig(
             enable_padding=True,
@@ -2730,8 +2725,7 @@ def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
                 "MOE TRTLLM backend does not support SM version 120 or 121")
 
         moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
-                                        free_gpu_memory_fraction=0.7,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
                                         tokens_per_block=64)
         cuda_graph_config = CudaGraphConfig(
             enable_padding=True,
