From 3a3b5e474689520fdf53b82a5b4949d70fe6de0e Mon Sep 17 00:00:00 2001 From: jenchen13 Date: Thu, 13 Nov 2025 19:20:30 -0800 Subject: [PATCH 1/3] export kv cache type Signed-off-by: jenchen13 --- modelopt/torch/export/unified_export_megatron.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py index 70a80aeec..b3f8d6c36 100644 --- a/modelopt/torch/export/unified_export_megatron.py +++ b/modelopt/torch/export/unified_export_megatron.py @@ -326,7 +326,6 @@ def save_pretrained( state_dict = self.extra_state_dict if self.export_extra_modules else self.state_dict quantization_format = get_quantization_format(self.model) quantization = None - kv_cache_quantization = None if quantization_format in ( QUANTIZATION_FP8_PB_REAL, @@ -338,6 +337,10 @@ def save_pretrained( elif quantization_format == QUANTIZATION_NVFP4: quantization = "NVFP4" + kv_cache_quantization = None + if get_kv_cache_dtype(self.model) == KV_CACHE_FP8: + # Only FP8 and NVFP4 KV cache are supported in vLLM for now + kv_cache_quantization = "FP8" # We use the last PP rank and the 1st EP rank to write the config because # medusa_heads and eagle_module only exist in the last stage. 
if is_last_stage_main_rank: From 44a52675b54c24447d91f07736aee7ecbeaf4916 Mon Sep 17 00:00:00 2001 From: jenchen13 Date: Fri, 14 Nov 2025 09:29:27 -0800 Subject: [PATCH 2/3] add nvfp4 Signed-off-by: jenchen13 --- modelopt/torch/export/unified_export_megatron.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py index b3f8d6c36..07ea44619 100644 --- a/modelopt/torch/export/unified_export_megatron.py +++ b/modelopt/torch/export/unified_export_megatron.py @@ -38,6 +38,7 @@ from .model_config import ( KV_CACHE_FP8, + KV_CACHE_NVFP4, QUANTIZATION_FP8, QUANTIZATION_FP8_PB_REAL, QUANTIZATION_FP8_PB_WO, @@ -338,9 +339,10 @@ def save_pretrained( quantization = "NVFP4" kv_cache_quantization = None - if get_kv_cache_dtype(self.model) == KV_CACHE_FP8: + kv_cache_dtype = get_kv_cache_dtype(self.model) + if kv_cache_dtype == KV_CACHE_FP8 or kv_cache_dtype == KV_CACHE_NVFP4: # Only FP8 and NVFP4 KV cache are supported in vLLM for now - kv_cache_quantization = "FP8" + kv_cache_quantization = kv_cache_dtype # We use the last PP rank and the 1st EP rank to write the config because # medusa_heads and eagle_module only exist in the last stage. 
if is_last_stage_main_rank: From 247bc54d1d76bd9567d2287c6953697180c8803c Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Fri, 14 Nov 2025 17:39:20 +0000 Subject: [PATCH 3/3] lint Signed-off-by: Jennifer Chen --- modelopt/torch/export/unified_export_megatron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py index 07ea44619..137a6a8d0 100644 --- a/modelopt/torch/export/unified_export_megatron.py +++ b/modelopt/torch/export/unified_export_megatron.py @@ -340,7 +340,7 @@ def save_pretrained( kv_cache_quantization = None kv_cache_dtype = get_kv_cache_dtype(self.model) - if kv_cache_dtype == KV_CACHE_FP8 or kv_cache_dtype == KV_CACHE_NVFP4: + if kv_cache_dtype in (KV_CACHE_FP8, KV_CACHE_NVFP4): # Only FP8 and NVFP4 KV cache are supported in vLLM for now kv_cache_quantization = kv_cache_dtype # We use the last PP rank and the 1st EP rank to write the config because