
Commit f3d282b

EddyLXJ authored and meta-codesync[bot] committed
st publish mode only load weight (#5116)
Summary: Pull Request resolved: #5116 X-link: meta-pytorch/torchrec#3538 X-link: https://github.com/facebookresearch/FBGEMM/pull/2122

For silvertorch publish, we don't want to load the optimizer state into the backend because the publish host has limited CPU memory. So we load the whole row into the state dict while loading the checkpoint during st publish, then save only the weight into the backend; after that the backend holds only metaheader + weight.

For the first load, the dim must be set to metaheader_dim + emb_dim + optimizer_state_dim, otherwise checkpoint loading throws a size-mismatch error. After the first load, we only need metaheader + weight from the backend for the state dict, so the dim can be set to metaheader_dim + emb_dim.

Reviewed By: emlin

Differential Revision: D85830053

fbshipit-source-id: 0eddbe9e69ea8271e8c77dc0147e87a08f0b3934
1 parent 94088ab commit f3d282b
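
To make the two row widths described in the summary concrete, here is a minimal Python sketch for the backend_return_whole_row case. The concrete numbers (metaheader_dim=8, emb_dim=130, optimizer_state_dim=2) are illustrative assumptions; only the pad4 rounding and the two formulas mirror the code in this commit.

    def pad4(n: int) -> int:
        # Round up to the nearest multiple of 4, as the TBE code does for row widths.
        return (n + 3) // 4 * 4

    metaheader_dim = 8        # already padded
    emb_dim = 130             # weight columns for one table (illustrative)
    optimizer_state_dim = 2   # per-row optimizer state elements (illustrative)

    # First checkpoint load in st publish mode: the state-dict row still carries the
    # optimizer state, so the KVT width must include it or loading hits a size mismatch.
    first_load_width = metaheader_dim + pad4(emb_dim) + pad4(optimizer_state_dim)

    # After the first load, the backend only holds metaheader + weight, so subsequent
    # state_dict reads use the narrower width.
    steady_state_width = metaheader_dim + pad4(emb_dim)

    print(first_load_width, steady_state_width)  # 144 140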

File tree

5 files changed: +72 -15 lines changed

fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_common.py

Lines changed: 3 additions & 0 deletions
@@ -248,6 +248,7 @@ class KVZCHParams(NamedTuple):
     backend_return_whole_row: bool = False
     eviction_policy: EvictionPolicy = EvictionPolicy()
     embedding_cache_mode: bool = False
+    load_ckpt_without_opt: bool = False

     def validate(self) -> None:
         assert len(self.bucket_offsets) == len(self.bucket_sizes), (
@@ -271,6 +272,8 @@ class KVZCHTBEConfig(NamedTuple):
     threshold_calculation_bucket_stride: float = 0.2
     # Total number of feature score buckets used for threshold calculation in feature score-based eviction.
     threshold_calculation_bucket_num: Optional[int] = 1000000  # 1M
+    # When true, we only save weight to kvzch backend and not optimizer state.
+    load_ckpt_without_opt: bool = False


 class BackendType(enum.IntEnum):
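
A hedged usage sketch of the new flag: the field name load_ckpt_without_opt comes from the diff above, but the bucket_offsets/bucket_sizes values are placeholders, and this assumes the remaining KVZCHParams fields have defaults; it is not taken from this commit.

    from fbgemm_gpu.split_table_batched_embeddings_ops_common import KVZCHParams

    # Placeholder bucket layout; load_ckpt_without_opt is the field added by this commit.
    params = KVZCHParams(
        bucket_offsets=[(0, 1)],
        bucket_sizes=[1024],
        load_ckpt_without_opt=True,  # st publish: backend keeps only metaheader + weight
    )
    assert params.load_ckpt_without_opt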

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 49 additions & 9 deletions
@@ -217,8 +217,13 @@ def __init__(
         self.enable_optimizer_offloading: bool = False
         self.backend_return_whole_row: bool = False
         self._embedding_cache_mode: bool = False
+        self.load_ckpt_without_opt: bool = False
         if self.kv_zch_params:
             self.kv_zch_params.validate()
+            self.load_ckpt_without_opt = (
+                # pyre-ignore [16]
+                self.kv_zch_params.load_ckpt_without_opt
+            )
             self.enable_optimizer_offloading = (
                 # pyre-ignore [16]
                 self.kv_zch_params.enable_optimizer_offloading
@@ -1105,7 +1110,9 @@ def cache_row_dim(self) -> int:
         padding to the nearest 4 elements and the optimizer state appended to
         the back of the row
         """
-        if self.enable_optimizer_offloading:
+
+        # For st publish, we only need to load the weight for publishing and bulk eval
+        if self.enable_optimizer_offloading and not self.load_ckpt_without_opt:
             return self.max_D + pad4(
                 # Compute the number of elements of cache_dtype needed to store
                 # the optimizer state
@@ -3092,6 +3099,38 @@ def _may_create_snapshot_for_state_dict(
             self.flush(force=should_flush)
         return snapshot_handle, checkpoint_handle

+    def get_embedding_dim_for_kvt(
+        self, metaheader_dim: int, emb_dim: int, is_loading_checkpoint: bool
+    ) -> int:
+        if self.load_ckpt_without_opt:
+            # For silvertorch publish, we don't want to load the optimizer state into the backend due to limited CPU memory on the publish host.
+            # So we load the whole row into the state dict while loading the checkpoint in st publish, then save only the weight into the backend;
+            # after that the backend holds only metaheader + weight.
+            # For the first load, the dim must be metaheader_dim + emb_dim + optimizer_state_dim, otherwise checkpoint loading throws a size-mismatch error.
+            # After the first load, we only need metaheader + weight from the backend for the state dict, so the dim can be metaheader_dim + emb_dim.
+            if is_loading_checkpoint:
+                return (
+                    (
+                        metaheader_dim  # metaheader is already padded
+                        + pad4(emb_dim)
+                        + pad4(self.optimizer_state_dim)
+                    )
+                    if self.backend_return_whole_row
+                    else emb_dim
+                )
+            else:
+                return metaheader_dim + pad4(emb_dim)
+        else:
+            return (
+                (
+                    metaheader_dim  # metaheader is already padded
+                    + pad4(emb_dim)
+                    + pad4(self.optimizer_state_dim)
+                )
+                if self.backend_return_whole_row
+                else emb_dim
+            )
+
     @torch.jit.export
     def split_embedding_weights(
         self,
@@ -3149,6 +3188,7 @@ def split_embedding_weights(

         table_offset = 0
         for i, (emb_height, emb_dim) in enumerate(self.embedding_specs):
+            is_loading_checkpoint = False
             bucket_ascending_id_tensor = None
             bucket_t = None
             metadata_tensor = None
@@ -3214,6 +3254,7 @@ def split_embedding_weights(
                         dtype=torch.int64,
                     )
                     skip_metadata = True
+                    is_loading_checkpoint = True

                 # self.local_weight_counts[i] = 0  # Reset the count

@@ -3238,14 +3279,8 @@ def split_embedding_weights(
                         if bucket_ascending_id_tensor is not None
                         else emb_height
                     ),
-                    (
-                        (
-                            metaheader_dim  # metaheader is already padded
-                            + pad4(emb_dim)
-                            + pad4(self.optimizer_state_dim)
-                        )
-                        if self.backend_return_whole_row
-                        else emb_dim
+                    self.get_embedding_dim_for_kvt(
+                        metaheader_dim, emb_dim, is_loading_checkpoint
                     ),
                 ],
                 dtype=dtype,
@@ -3257,6 +3292,11 @@ def split_embedding_weights(
                     bucket_ascending_id_tensor if self.kv_zch_params else None
                 ),
                 checkpoint_handle=checkpoint_handle,
+                only_load_weight=(
+                    True
+                    if self.load_ckpt_without_opt and is_loading_checkpoint
+                    else False
+                ),
             )
             (
                 tensor_wrapper.set_embedding_rocks_dp_wrapper(self.ssd_db)
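
The wiring above reduces to two small decisions: which width to give the KV tensor, and whether to tell the wrapper to write only the weight. The sketch below is a standalone Python paraphrase of get_embedding_dim_for_kvt and of the only_load_weight condition, not the library API; the argument names mirror the diff, and pad4 is redefined locally for self-containment.

    def pad4(n: int) -> int:
        return (n + 3) // 4 * 4

    def kvt_dim(
        metaheader_dim: int,
        emb_dim: int,
        optimizer_state_dim: int,
        backend_return_whole_row: bool,
        load_ckpt_without_opt: bool,
        is_loading_checkpoint: bool,
    ) -> int:
        full = metaheader_dim + pad4(emb_dim) + pad4(optimizer_state_dim)
        if not load_ckpt_without_opt:
            # Normal path: whole row (metaheader + weight + opt) when the backend returns it.
            return full if backend_return_whole_row else emb_dim
        if is_loading_checkpoint:
            # First load in publish mode: keep the optimizer columns so the checkpoint fits.
            return full if backend_return_whole_row else emb_dim
        # Subsequent reads in publish mode: backend only holds metaheader + weight.
        return metaheader_dim + pad4(emb_dim)

    def only_load_weight(load_ckpt_without_opt: bool, is_loading_checkpoint: bool) -> bool:
        # The KVTensorWrapper drops the optimizer columns on write only in this case.
        return load_ckpt_without_opt and is_loading_checkpoint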

fbgemm_gpu/src/ssd_split_embeddings_cache/kv_tensor_wrapper.h

Lines changed: 3 additions & 1 deletion
@@ -65,7 +65,8 @@ class KVTensorWrapper : public torch::jit::CustomClassHolder {
       int64_t width_offset = 0,
       const std::optional<c10::intrusive_ptr<RocksdbCheckpointHandleWrapper>>
           checkpoint_handle = std::nullopt,
-      bool read_only = false);
+      bool read_only = false,
+      bool only_load_weight = false);

   explicit KVTensorWrapper(const std::string& serialized);

@@ -153,6 +154,7 @@ class KVTensorWrapper : public torch::jit::CustomClassHolder {
   int64_t max_D{};
   std::string checkpoint_uuid;
   bool read_only_{};
+  bool only_load_weight_{};
 };

 void to_json(json& j, const KVTensorWrapper& kvt);

fbgemm_gpu/src/ssd_split_embeddings_cache/kv_tensor_wrapper_cpu.cpp

Lines changed: 2 additions & 1 deletion
@@ -38,7 +38,8 @@ KVTensorWrapper::KVTensorWrapper(
     [[maybe_unused]] int64_t width_offset,
     [[maybe_unused]] const std::optional<
         c10::intrusive_ptr<RocksdbCheckpointHandleWrapper>>,
-    [[maybe_unused]] bool read_only)
+    [[maybe_unused]] bool read_only,
+    [[maybe_unused]] bool only_load_weight)
     // @lint-ignore CLANGTIDY clang-diagnostic-missing-noreturn
     : shape_(std::move(shape)), row_offset_(row_offset) {
   FBEXCEPTION("Not implemented");

fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp

Lines changed: 15 additions & 4 deletions
@@ -374,12 +374,14 @@ KVTensorWrapper::KVTensorWrapper(
     int64_t width_offset_,
     const std::optional<c10::intrusive_ptr<RocksdbCheckpointHandleWrapper>>
         checkpoint_handle,
-    bool read_only)
+    bool read_only,
+    bool only_load_weight)
     : db_(nullptr),
       shape_(std::move(shape)),
       row_offset_(row_offset),
       width_offset_(width_offset_),
-      read_only_(read_only) {
+      read_only_(read_only),
+      only_load_weight_(only_load_weight) {
   CHECK_GE(width_offset_, 0);
   CHECK_EQ(shape_.size(), 2) << "Only 2D emb tensors are supported";
   options_ = at::TensorOptions()
@@ -558,7 +560,10 @@ void KVTensorWrapper::set_range(
   CHECK(db_) << "EmbeddingRocksDB must be a valid pointer to call set_range";
   CHECK_EQ(dim, 0) << "Only set_range on dim 0 is supported";
   CHECK_TRUE(db_ != nullptr);
-  CHECK_GE(db_->get_max_D() + db_->get_metaheader_width_in_front(), shape_[1]);
+  if (!only_load_weight_) {
+    CHECK_GE(
+        db_->get_max_D() + db_->get_metaheader_width_in_front(), shape_[1]);
+  }

   if (db_->get_backend_return_whole_row()) {
     // backend returns whole row, so we need to replace the first 8 bytes with
@@ -576,6 +581,10 @@ void KVTensorWrapper::set_range(
       db_->get_max_D() + db_->get_metaheader_width_in_front() - weights.size(1);
   if (pad_right == 0) {
     db_->set_range_to_storage(weights, start + row_offset_, length);
+  } else if (pad_right < 0 && only_load_weight_) {
+    int64_t cut_dim = db_->get_max_D() + db_->get_metaheader_width_in_front();
+    at::Tensor new_weights = weights.narrow(1, 0, cut_dim).contiguous();
+    db_->set_range_to_storage(new_weights, start + row_offset_, length);
   } else {
     std::vector<int64_t> padding = {0, pad_right, 0, 0};
     auto padded_weights = torch::constant_pad_nd(weights, padding, 0);
@@ -1080,6 +1089,7 @@ static auto kv_tensor_wrapper =
             int64_t,
             std::optional<
                 c10::intrusive_ptr<RocksdbCheckpointHandleWrapper>>,
+            bool,
             bool>(),
         "",
         {torch::arg("shape"),
@@ -1091,7 +1101,8 @@ static auto kv_tensor_wrapper =
          torch::arg("sorted_indices") = std::nullopt,
          torch::arg("width_offset") = 0,
          torch::arg("checkpoint_handle") = std::nullopt,
-         torch::arg("read_only") = false})
+         torch::arg("read_only") = false,
+         torch::arg("only_load_weight") = false})
     .def(
         "set_embedding_rocks_dp_wrapper",
         &KVTensorWrapper::set_embedding_rocks_dp_wrapper,
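
The new set_range branch only matters in publish mode: when the incoming state-dict row is wider than what the backend stores (metaheader + weight), the trailing optimizer columns are cut off before the write. Below is a Python sketch of that narrowing, assuming a hypothetical backend width; in the C++ code the width is db_->get_max_D() + db_->get_metaheader_width_in_front(), and this branch is taken only when pad_right < 0 and only_load_weight_ is set.

    import torch

    # Hypothetical sizes: the backend stores metaheader (8) + padded weight (132) = 140
    # columns, while the checkpoint row still carries 4 padded optimizer columns.
    backend_width = 140
    rows = torch.randn(16, 144)  # rows coming from the checkpoint / state dict

    if rows.size(1) > backend_width:
        # Mirror of the only_load_weight_ branch: drop the trailing optimizer columns
        # so only metaheader + weight is written to storage.
        rows = rows.narrow(1, 0, backend_width).contiguous()

    assert rows.shape == (16, 140)

Rows narrower than the backend width keep the existing behavior and are right-padded with zeros before being written, as in the unchanged else branch.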
