diff --git a/c/include/cuvs/neighbors/hnsw.h b/c/include/cuvs/neighbors/hnsw.h
index 15eb1b0569..9f4c1c4445 100644
--- a/c/include/cuvs/neighbors/hnsw.h
+++ b/c/include/cuvs/neighbors/hnsw.h
@@ -35,7 +35,9 @@ enum cuvsHnswHierarchy {
   /* Full hierarchy is built using the CPU */
   CPU = 1,
   /* Full hierarchy is built using the GPU */
-  GPU = 2
+  GPU = 2,
+  /* GPU-built hierarchy stored as a layered on-disk topology artifact */
+  GPU_LAYERED_ON_DISK = 3
 };
 
 /**
@@ -130,6 +132,14 @@ struct cuvsHnswIndexParams {
    * Set to nullptr for default behavior (from_cagra conversion).
    */
   cuvsHnswAceParams_t ace_params;
+  /**
+   * Local dataset path used by layered HNSW deserialization.
+   *
+   * When `hierarchy == GPU_LAYERED_ON_DISK`, the index artifact stores graph topology only.
+   * `cuvsHnswDeserialize` loads the original-ID-ordered vectors from this local dataset path to
+   * reconstruct an in-memory HNSW index. Set to nullptr (default) for all other hierarchies.
+   */
+  const char* dataset_path;
 };
 
 typedef struct cuvsHnswIndexParams* cuvsHnswIndexParams_t;
@@ -601,6 +611,80 @@ CUVS_EXPORT cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
  * @}
  */
 
+/**
+ * @defgroup hnsw_c_index_materialize Materialize a layered HNSW artifact to an hnswlib index
+ * @{
+ */
+
+/**
+ * @brief Parameters for materializing a layered HNSW artifact into an hnswlib index on disk.
+ */
+struct cuvsHnswMaterializeParams {
+  /**
+   * Local dataset path holding the original-ID-ordered vectors used to build the artifact.
+   *
+   * Supported formats match layered deserialization: `.npy` and ANN benchmark `*.bin` files with a
+   * `[uint32 rows, uint32 cols]` header (`.fbin`, `.f16bin`, `.u8bin`, `.i8bin`).
+   */
+  const char* dataset_path;
+  /**
+   * Upper bound on host memory (in GiB) used for the base-topology reorder buffer.
+   *
+   * When `<= 0`, the whole base topology is reordered in a single in-memory pass (no temporary
+   * files). When set, the base topology is reordered through bucketed temporary files so that peak
+   * host memory stays close to this budget.
+   */
+  double max_host_memory_gb;
+  /** Number of host threads to use. When `0`, the maximum number of threads is used. */
+  int num_threads;
+};
+
+typedef struct cuvsHnswMaterializeParams* cuvsHnswMaterializeParams_t;
+
+/**
+ * @brief Allocate HNSW materialize params, and populate with default values
+ *
+ * @param[in] params cuvsHnswMaterializeParams_t to allocate
+ * @return cuvsError_t
+ */
+CUVS_EXPORT cuvsError_t cuvsHnswMaterializeParamsCreate(cuvsHnswMaterializeParams_t* params);
+
+/**
+ * @brief De-allocate HNSW materialize params
+ *
+ * @param[in] params cuvsHnswMaterializeParams_t to de-allocate
+ * @return cuvsError_t
+ */
+CUVS_EXPORT cuvsError_t cuvsHnswMaterializeParamsDestroy(cuvsHnswMaterializeParams_t params);
+
+/**
+ * @brief Materialize a layered HNSW artifact into a standard hnswlib index file on disk.
+ *
+ * Materializes a `GPU_LAYERED_ON_DISK` artifact (graph topology only, stored in ACE order) plus a
+ * local dataset into a standard hnswlib index file, without ever holding the full materialized
+ * index in host memory. The resulting file is compatible with the original hnswlib library and can
+ * be read back through `cuvsHnswDeserialize` with `hierarchy == CPU`. The element data type
+ * (`float`, `half`, `uint8_t` or `int8_t`) is read from the artifact header.
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] params cuvsHnswMaterializeParams_t materialization parameters
+ * @param[in] layered_artifact_path path to the layered HNSW artifact
+ * @param[in] output_path path to the hnswlib index file to write
+ * @param[in] dim the dimension of the vectors in the index
+ * @param[in] metric the distance metric used to build the index
+ * @return cuvsError_t
+ */
+CUVS_EXPORT cuvsError_t cuvsHnswMaterializeToHnswlib(cuvsResources_t res,
+                                                     cuvsHnswMaterializeParams_t params,
+                                                     const char* layered_artifact_path,
+                                                     const char* output_path,
+                                                     int dim,
+                                                     cuvsDistanceType metric);
+
+/**
+ * @}
+ */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/c/src/neighbors/hnsw.cpp b/c/src/neighbors/hnsw.cpp
index c69eda0ca0..c791c004a3 100644
--- a/c/src/neighbors/hnsw.cpp
+++ b/c/src/neighbors/hnsw.cpp
@@ -143,6 +143,8 @@ void* _deserialize(cuvsResources_t res,
   cuvs::neighbors::hnsw::index<T>* index = nullptr;
   auto cpp_params                        = cuvs::neighbors::hnsw::index_params();
   cpp_params.hierarchy = static_cast<cuvs::neighbors::hnsw::HnswHierarchy>(params->hierarchy);
+  // Required by GPU_LAYERED_ON_DISK deserialization to locate the original-ID-ordered vectors.
+  if (params->dataset_path != nullptr) { cpp_params.dataset_path = std::string(params->dataset_path); }
   auto metric_type     = static_cast<cuvs::distance::DistanceType>(metric);
   cuvs::neighbors::hnsw::deserialize(
     *res_ptr, cpp_params, std::string(filename), dim, metric_type, &index);
@@ -174,7 +176,8 @@ extern "C" cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params)
                                       .num_threads     = 0,
                                       .M               = 32,
                                       .metric          = L2Expanded,
-                                      .ace_params      = nullptr};
+                                      .ace_params      = nullptr,
+                                      .dataset_path    = nullptr};
   });
 }
 
@@ -404,3 +407,41 @@ extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
     }
   });
 }
+
+extern "C" cuvsError_t cuvsHnswMaterializeParamsCreate(cuvsHnswMaterializeParams_t* params)
+{
+  return cuvs::core::translate_exceptions([=] {
+    *params = new cuvsHnswMaterializeParams{
+      .dataset_path = nullptr, .max_host_memory_gb = 0, .num_threads = 0};
+  });
+}
+
+extern "C" cuvsError_t cuvsHnswMaterializeParamsDestroy(cuvsHnswMaterializeParams_t params)
+{
+  return cuvs::core::translate_exceptions([=] { delete params; });
+}
+
+extern "C" cuvsError_t cuvsHnswMaterializeToHnswlib(cuvsResources_t res,
+                                                    cuvsHnswMaterializeParams_t params,
+                                                    const char* layered_artifact_path,
+                                                    const char* output_path,
+                                                    int dim,
+                                                    cuvsDistanceType metric)
+{
+  return cuvs::core::translate_exceptions([=] {
+    auto res_ptr    = reinterpret_cast<raft::resources*>(res);
+    auto cpp_params = cuvs::neighbors::hnsw::materialize_params();
+    if (params->dataset_path != nullptr) {
+      cpp_params.dataset_path = std::string(params->dataset_path);
+    }
+    cpp_params.max_host_memory_gb = params->max_host_memory_gb;
+    cpp_params.num_threads        = params->num_threads;
+    auto metric_type              = static_cast<cuvs::distance::DistanceType>(metric);
+    cuvs::neighbors::hnsw::materialize_to_hnswlib(*res_ptr,
+                                                  cpp_params,
+                                                  std::string(layered_artifact_path),
+                                                  std::string(output_path),
+                                                  dim,
+                                                  metric_type);
+  });
+}
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
index 26028b6d98..65adc926f8 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
@@ -1,9 +1,10 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
 #include "../common/ann_types.hpp"
+#include "../common/conf.hpp"
 #include "cuvs_ann_bench_param_parser.h"
 #include "cuvs_cagra_hnswlib_wrapper.h"
 
@@ -27,6 +28,8 @@ auto parse_build_param(const nlohmann::json& conf) ->
       hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU;
     } else if (conf.at("hierarchy") == "gpu") {
       hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU;
+    } else if (conf.at("hierarchy") == "gpu_layered_on_disk") {
+      hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK;
     } else {
       THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get<std::string>().c_str());
     }
@@ -36,6 +39,11 @@ auto parse_build_param(const nlohmann::json& conf) ->
   if (conf.contains("ef_construction")) {
     hnsw_params.ef_construction = conf.at("ef_construction");
   }
+  if (conf.contains("dataset_path")) {
+    hnsw_params.dataset_path = conf.at("dataset_path");
+  } else if (hnsw_params.hierarchy == cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    hnsw_params.dataset_path = configuration::singleton().get_dataset_conf().base_file;
+  }
   if (conf.contains("num_threads")) { hnsw_params.num_threads = conf.at("num_threads"); }
 
   // Reuse the CAGRA wrapper params parser
@@ -55,16 +63,18 @@ auto parse_build_param(const nlohmann::json& conf) ->
         cuvs::neighbors::cagra::hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT,
         dist_type);
       ps.metric = dist_type;
-      // Parse ACE parameters if provided
-      if (conf.contains("npartitions") || conf.contains("build_dir") ||
-          conf.contains("ef_construction") || conf.contains("use_disk")) {
+      // Parse ACE parameters if provided.
+      auto ace_conf = collect_conf_with_prefix(conf, "ace_");
+      if (!ace_conf.empty()) {
         auto ace_params = cuvs::neighbors::cagra::graph_build_params::ace_params();
-        if (conf.contains("npartitions")) { ace_params.npartitions = conf.at("npartitions"); }
-        if (conf.contains("build_dir")) { ace_params.build_dir = conf.at("build_dir"); }
-        if (conf.contains("ef_construction")) {
-          ace_params.ef_construction = conf.at("ef_construction");
+        if (ace_conf.contains("npartitions")) {
+          ace_params.npartitions = ace_conf.at("npartitions");
+        }
+        if (ace_conf.contains("build_dir")) { ace_params.build_dir = ace_conf.at("build_dir"); }
+        if (ace_conf.contains("ef_construction")) {
+          ace_params.ef_construction = ace_conf.at("ef_construction");
         }
-        if (conf.contains("use_disk")) { ace_params.use_disk = conf.at("use_disk"); }
+        if (ace_conf.contains("use_disk")) { ace_params.use_disk = ace_conf.at("use_disk"); }
         ps.graph_build_params = ace_params;
       }
       // NB: above, we only provide the defaults. Below we parse the explicit parameters as usual.
diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
index 2f0c54e1bd..8eb1b72991 100644
--- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
@@ -9,10 +9,33 @@
 #include <raft/core/logger.hpp>
 
 #include <chrono>
+#include <filesystem>
 #include <memory>
 
 namespace cuvs::bench {
 
+inline void move_file_overwrite(const std::filesystem::path& src, const std::filesystem::path& dst)
+{
+  std::error_code ec;
+  if (src == dst ||
+      (std::filesystem::exists(dst, ec) && std::filesystem::equivalent(src, dst, ec))) {
+    return;
+  }
+  if (!dst.parent_path().empty()) { std::filesystem::create_directories(dst.parent_path()); }
+  if (std::filesystem::exists(dst, ec)) { std::filesystem::remove(dst, ec); }
+
+  std::filesystem::rename(src, dst, ec);
+  if (ec) {
+    // Rename fails across filesystems. Fall back to copy followed by removal of the source.
+    ec.clear();
+    std::filesystem::copy_file(src, dst, std::filesystem::copy_options::overwrite_existing, ec);
+    const auto src_str = src.string();
+    const auto dst_str = dst.string();
+    RAFT_EXPECTS(!ec, "Failed to move '%s' to '%s'.", src_str.c_str(), dst_str.c_str());
+    std::filesystem::remove(src, ec);
+  }
+}
+
 template <typename T, typename IdxT>
 class cuvs_cagra_hnswlib : public algo<T>, public algo_gpu {
  public:
@@ -130,18 +153,25 @@ void cuvs_cagra_hnswlib<T, IdxT>::set_search_param(const search_param_base& para
 template <typename T, typename IdxT>
 void cuvs_cagra_hnswlib<T, IdxT>::save(const std::string& file) const
 {
+  if (build_param_.hnsw_index_params.hierarchy ==
+      cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    const auto src_artifact = std::filesystem::path(hnsw_index_->file_path());
+    RAFT_EXPECTS(!src_artifact.empty(), "Layered HNSW artifact path is not available.");
+    RAFT_EXPECTS(std::filesystem::exists(src_artifact),
+                 "Layered HNSW artifact '%s' does not exist.",
+                 src_artifact.c_str());
+
+    move_file_overwrite(src_artifact, std::filesystem::path(file));
+    return;
+  }
+
   if (cagra_ace_build_) {
     std::string index_filename = hnsw_index_->file_path();
     RAFT_EXPECTS(!index_filename.empty(), "HNSW index file path is not available.");
     RAFT_EXPECTS(std::filesystem::exists(index_filename),
                  "Index file '%s' does not exist.",
                  index_filename.c_str());
-    if (std::filesystem::exists(file)) { std::filesystem::remove(file); }
-    // might fail when using 2 different filesystems
-    std::error_code ec;
-    std::filesystem::rename(index_filename, file, ec);
-    RAFT_EXPECTS(
-      !ec, "Failed to rename index file '%s' to '%s'.", index_filename.c_str(), file.c_str());
+    move_file_overwrite(std::filesystem::path(index_filename), std::filesystem::path(file));
   } else {
     cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get()));
   }
diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp
index fb726fed71..bc5a58972a 100644
--- a/cpp/include/cuvs/neighbors/hnsw.hpp
+++ b/cpp/include/cuvs/neighbors/hnsw.hpp
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <cuvs/core/export.hpp>
 #include <memory>
+#include <string>
 #include <type_traits>
 #include <variant>
 
@@ -41,9 +42,10 @@ namespace graph_build_params = cuvs::neighbors::graph_build_params;
  * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index.
  */
 enum class HnswHierarchy {
-  NONE,  // base-layer-only index
-  CPU,   // full index with CPU-built hierarchy
-  GPU    // full index with GPU-built hierarchy
+  NONE,                // base-layer-only index
+  CPU,                 // full index with CPU-built hierarchy
+  GPU,                 // full index with GPU-built hierarchy
+  GPU_LAYERED_ON_DISK  // GPU-built hierarchy stored as layered on-disk topology
 };
 
 struct index_params : cuvs::neighbors::index_params {
@@ -64,6 +66,16 @@ struct index_params : cuvs::neighbors::index_params {
    */
   size_t M = 32;
 
+  /** Local dataset path used by layered HNSW deserialization.
+   *
+   * When `hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK`, the index artifact stores graph
+   * topology only. `deserialize` loads vectors from this local dataset path to reconstruct an
+   * in-memory HNSW index.
+   * Currently supported local dataset formats are `.npy` and ANN benchmark `*.bin` files with a
+   * `[uint32 rows, uint32 cols]` header.
+   */
+  std::string dataset_path;
+
   /** Parameters to fine tune GPU graph building. By default we select the parameters based on
    * dataset shape and HNSW build parameters. You can override these parameters to fine tune the
    * graph building process as described in the CAGRA build docs.
@@ -1234,6 +1246,75 @@ void deserialize(raft::resources const& res,
  * @}
  */
 
+/**
+ * @defgroup hnsw_cpp_index_materialize Materialize a layered HNSW artifact into an hnswlib index
+ * @{
+ */
+
+/**
+ * @brief Parameters for materializing a layered HNSW artifact into an hnswlib index on disk.
+ */
+struct materialize_params {
+  /** Local dataset path holding the original-ID-ordered vectors used to build the artifact.
+   *
+   * Supported formats match layered deserialization: `.npy` and ANN benchmark `*.bin` files with a
+   * `[uint32 rows, uint32 cols]` header (`.fbin`, `.f16bin`, `.u8bin`, `.i8bin`).
+   */
+  std::string dataset_path;
+
+  /** Upper bound on host memory (in GiB) used for the base-topology reorder buffer.
+   *
+   * When `<= 0`, the whole base topology is reordered in a single in-memory pass (no temporary
+   * files). When set, the base topology is reordered through bucketed temporary files so that
+   * peak host memory stays close to this budget, at the cost of writing and re-reading the
+   * (small) base-topology section once.
+   */
+  double max_host_memory_gb = 0;
+
+  /** Number of host threads to use. When `0`, the maximum number of threads is used. */
+  int num_threads = 0;
+};
+
+/**
+ * @brief Materialize a layered HNSW artifact into a standard hnswlib index file on disk.
+ *
+ * Materializes a `GPU_LAYERED_ON_DISK` artifact (graph topology only, stored in ACE order) plus a
+ * local dataset into a standard hnswlib index file, without ever holding the full materialized
+ * index in host memory. The materialization reorders the base topology from ACE order to
+ * original-id order and interleaves the vectors, emitting the output with sequential disk IO. The
+ * resulting file is compatible with the original hnswlib library (`loadIndex`) and can be read back
+ * through `cuvs::neighbors::hnsw::deserialize` with `hierarchy == HnswHierarchy::CPU`.
+ *
+ * The element data type (`float`, `half`, `uint8_t` or `int8_t`) is read from the artifact header,
+ * so a single entry point covers all supported dtypes.
+ *
+ * @param[in] res raft resources
+ * @param[in] params materialization parameters (dataset path, host-memory budget, threads)
+ * @param[in] layered_artifact_path path to the layered HNSW artifact
+ * @param[in] output_path path to the hnswlib index file to write
+ * @param[in] dim dimensions of the training dataset
+ * @param[in] metric distance metric. Supported metrics ("L2Expanded", "InnerProduct")
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   hnsw::materialize_params materialize_params;
+ *   materialize_params.dataset_path = "dataset.fbin";
+ *   hnsw::materialize_to_hnswlib(
+ *     res, materialize_params, "layered_artifact.cuvs", "index.bin", dim, metric);
+ * @endcode
+ */
+void materialize_to_hnswlib(raft::resources const& res,
+                            const materialize_params& params,
+                            const std::string& layered_artifact_path,
+                            const std::string& output_path,
+                            int dim,
+                            cuvs::distance::DistanceType metric);
+
+/**
+ * @}
+ */
+
 }  // namespace hnsw
 }  // namespace neighbors
 }  // namespace CUVS_EXPORT cuvs
diff --git a/cpp/include/cuvs/util/file_io.hpp b/cpp/include/cuvs/util/file_io.hpp
index f02e9c2d7d..52d10a53e7 100644
--- a/cpp/include/cuvs/util/file_io.hpp
+++ b/cpp/include/cuvs/util/file_io.hpp
@@ -254,6 +254,19 @@ void write_large_file(const file_descriptor& fd,
                       const size_t total_bytes,
                       const uint64_t file_offset);
 
+/**
+ * @brief Pre-size a file to `total_bytes` bytes.
+ *
+ * Prefers posix_fallocate (reserves blocks up-front, avoids later ENOSPC and fragmentation), but
+ * falls back to ftruncate on filesystems that do not support preallocation (tmpfs and some
+ * NFS/overlay mounts return EOPNOTSUPP/EINVAL/ENOSYS) so the operation still succeeds there. A
+ * `total_bytes` of 0 is a no-op. Throws on failure (uses the descriptor's path in the message).
+ *
+ * @param fd File descriptor to pre-size
+ * @param total_bytes Target file size in bytes
+ */
+void preallocate_file(const file_descriptor& fd, const size_t total_bytes);
+
 /**
  * @brief Buffered output stream wrapper
  *
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index 96ff8344d3..3edbec46af 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -51,6 +51,13 @@ namespace cuvs::neighbors::cagra::detail {
 constexpr double to_mib(size_t bytes) { return static_cast<double>(bytes) / (1 << 20); }
 constexpr double to_gib(size_t bytes) { return static_cast<double>(bytes) / (1 << 30); }
 
+inline auto progress_step_10(size_t completed, size_t total) -> size_t
+{
+  if (total == 0) { return 100; }
+  const auto percent = (std::min(completed, total) * 100) / total;
+  return completed >= total ? 100 : (percent / 10) * 10;
+}
+
 template <typename T, typename IdxT>
 void check_graph_degree(size_t& intermediate_degree, size_t& graph_degree, size_t dataset_size)
 {
@@ -99,7 +106,7 @@ void ace_get_partition_labels(
   const size_t min_samples = 100 * n_partitions;
   n_samples                = std::max(n_samples, min_samples);
   n_samples                = std::min(n_samples, dataset_size);
-  RAFT_LOG_DEBUG("ACE: n_samples: %lu", n_samples);
+  RAFT_LOG_DEBUG("ACE build: partition labeling uses %lu sampled vectors", n_samples);
 
   auto sample_db = raft::make_host_matrix<float, int64_t>(n_samples, dataset_dim);
 #pragma omp parallel for
@@ -123,18 +130,10 @@ void ace_get_partition_labels(
   auto _sub_distances     = raft::make_host_matrix<float, int64_t>(chunk_size, n_partitions);
   auto _sub_dataset_dev   = raft::make_device_matrix<float, int64_t>(res, chunk_size, dataset_dim);
   auto _sub_distances_dev = raft::make_device_matrix<float, int64_t>(res, chunk_size, n_partitions);
-  size_t report_interval  = dataset_size / 10;
-  report_interval         = (report_interval / chunk_size) * chunk_size;
-  report_interval         = std::max(report_interval, chunk_size);
+  size_t next_progress_percent = 10;
 
   for (size_t i_base = 0; i_base < dataset_size; i_base += chunk_size) {
     const size_t sub_dataset_size = std::min(chunk_size, dataset_size - i_base);
-    if (i_base % report_interval == 0) {
-      RAFT_LOG_INFO("ACE: Processing chunk %lu / %lu (%.1f%%)",
-                    i_base,
-                    dataset_size,
-                    static_cast<double>(100 * i_base) / dataset_size);
-    }
 
     auto sub_dataset = raft::make_host_matrix_view<float, int64_t>(
       _sub_dataset.data_handle(), sub_dataset_size, dataset_dim);
@@ -191,6 +190,16 @@ void ace_get_partition_labels(
 #pragma omp atomic update
       partition_histogram(augmented_label, 1) += 1;
     }
+
+    const auto completed_rows = i_base + sub_dataset_size;
+    const auto report_percent = progress_step_10(completed_rows, dataset_size);
+    if (report_percent >= next_progress_percent) {
+      RAFT_LOG_INFO("ACE build: partition labeling progress %zu%% (%zu/%zu rows)",
+                    report_percent,
+                    completed_rows,
+                    dataset_size);
+      next_progress_percent = report_percent + 10;
+    }
   }
 }
 
@@ -261,15 +270,15 @@ void ace_check_partition_sizes(
 
     if (total_count > 0 && total_count < very_small_threshold) {
       RAFT_LOG_WARN(
-        "ACE: Partition %lu is very small (%lu vectors, expected ~%.1f). This may affect graph "
-        "quality.",
+        "ACE build: partition %lu is very small (%lu vectors, expected ~%.1f); graph quality may "
+        "be affected",
         c,
         total_count,
         expected_avg_vectors);
     } else if (total_count > very_large_threshold) {
       RAFT_LOG_WARN(
-        "ACE: Partition %lu is very large (%lu vectors, expected ~%.1f, threshold: %lu). This may "
-        "indicate imbalance and can lead to memory issues in restricted environments.",
+        "ACE build: partition %lu is very large (%lu vectors, expected ~%.1f, threshold=%lu); "
+        "partition imbalance may increase memory pressure",
         c,
         total_count,
         expected_avg_vectors,
@@ -482,11 +491,10 @@ void ace_reorder_and_store_dataset(
   size_t dataset_dim  = dataset.extent(1);
   size_t n_partitions = partition_histogram.extent(0);
 
-  RAFT_LOG_DEBUG(
-    "ACE: Reordering and storing dataset to disk (%lu vectors, %lu dimensions, %lu partitions)",
-    dataset_size,
-    dataset_dim,
-    n_partitions);
+  RAFT_LOG_DEBUG("ACE build: reordering dataset to disk (rows=%lu dim=%lu partitions=%lu)",
+                 dataset_size,
+                 dataset_dim,
+                 n_partitions);
 
   // Calculate total sizes for pre-allocation
   size_t total_core_vectors      = 0;
@@ -509,10 +517,10 @@ void ace_reorder_and_store_dataset(
   size_t reordered_file_size = total_core_vectors * vector_size;
   size_t augmented_file_size = total_augmented_vectors * vector_size;
 
-  RAFT_LOG_DEBUG("ACE: Reordered dataset: %lu core vectors (%.2f GiB)",
+  RAFT_LOG_DEBUG("ACE build: reordered dataset section rows=%lu size=%.2f GiB",
                  total_core_vectors,
                  reordered_file_size / (1024.0 * 1024.0 * 1024.0));
-  RAFT_LOG_DEBUG("ACE: Augmented dataset: %lu secondary vectors (%.2f GiB)",
+  RAFT_LOG_DEBUG("ACE build: augmented dataset section rows=%lu size=%.2f GiB",
                  total_augmented_vectors,
                  augmented_file_size / (1024.0 * 1024.0 * 1024.0));
 
@@ -541,7 +549,7 @@ void ace_reorder_and_store_dataset(
   disk_write_size           = std::min<size_t>(disk_write_size, 64 * 1024 * 1024);
   size_t vectors_per_buffer = std::max<size_t>(64, disk_write_size / vector_size);
 
-  RAFT_LOG_DEBUG("ACE: Reorder buffers: %lu vectors per buffer (%.2f MiB)",
+  RAFT_LOG_DEBUG("ACE build: reorder buffer rows=%lu size=%.2f MiB",
                  vectors_per_buffer,
                  to_mib(vectors_per_buffer * vector_size));
 
@@ -593,8 +601,8 @@ void ace_reorder_and_store_dataset(
     }
   };
 
-  size_t vectors_processed  = 0;
-  const size_t log_interval = std::max(dataset_size / 10, size_t(1));
+  size_t vectors_processed     = 0;
+  size_t next_progress_percent = 10;
   for (size_t i = 0; i < dataset_size; i++) {
     size_t core_partition      = partition_labels(i, 0);
     size_t secondary_partition = partition_labels(i, 1);
@@ -623,16 +631,18 @@ void ace_reorder_and_store_dataset(
     }
 
     vectors_processed++;
-    if (vectors_processed % log_interval == 0) {
-      RAFT_LOG_INFO("ACE: Processed %lu/%lu vectors (%.1f%%)",
+    const auto report_percent = progress_step_10(vectors_processed, dataset_size);
+    if (report_percent >= next_progress_percent) {
+      RAFT_LOG_INFO("ACE build: dataset reorder progress %zu%% (%zu/%zu rows)",
+                    report_percent,
                     vectors_processed,
-                    dataset_size,
-                    100.0 * vectors_processed / dataset_size);
+                    dataset_size);
+      next_progress_percent = report_percent + 10;
     }
   }
 
   // Flush all remaining buffers
-  RAFT_LOG_DEBUG("ACE: Flushing remaining buffers...");
+  RAFT_LOG_DEBUG("ACE build: flushing remaining reorder buffers");
 #pragma omp parallel sections
   {
 #pragma omp section
@@ -662,12 +672,12 @@ void ace_reorder_and_store_dataset(
     elapsed_ms > 0 ? to_mib(total_bytes_written) / (elapsed_ms / 1000.0) : 0.0;
 
   RAFT_LOG_INFO(
-    "ACE: Dataset (%.2f GiB reordered, %.2f GiB augmented, %.2f GiB mapping) reordering completed "
-    "in %ld ms (%.1f MiB/s)",
+    "ACE build: dataset reorder completed in %ld ms (reordered=%.2f GiB augmented=%.2f GiB "
+    "mapping=%.2f GiB, %.1f MiB/s)",
+    elapsed_ms,
     reordered_file_size / (1024.0 * 1024.0 * 1024.0),
     augmented_file_size / (1024.0 * 1024.0 * 1024.0),
     mapping_file_size / (1024.0 * 1024.0 * 1024.0),
-    elapsed_ms,
     throughput_mb_s);
 }
 
@@ -685,17 +695,17 @@ void ace_load_partition_dataset_from_disk(
 {
   size_t n_partitions = partition_histogram.extent(0);
 
-  RAFT_LOG_DEBUG("ACE: Loading partition %lu dataset from disk", partition_id);
+  RAFT_LOG_DEBUG("ACE build: loading partition %lu dataset from disk", partition_id);
 
   size_t core_size            = partition_histogram(partition_id, 0);
   size_t augmented_size       = partition_histogram(partition_id, 1);
   size_t total_partition_size = core_size + augmented_size;
 
-  RAFT_LOG_DEBUG("ACE: Partition %lu: %lu core + %lu augmented = %lu total vectors",
+  RAFT_LOG_DEBUG("ACE build: partition %lu rows=%lu (core=%lu augmented=%lu)",
                  partition_id,
+                 total_partition_size,
                  core_size,
-                 augmented_size,
-                 total_partition_size);
+                 augmented_size);
 
   RAFT_EXPECTS(static_cast<size_t>(sub_dataset.extent(0)) == total_partition_size,
                "sub_dataset rows (%lu) must match total partition size (%lu)",
@@ -712,10 +722,10 @@ void ace_load_partition_dataset_from_disk(
   const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy";
 
   if (!std::filesystem::exists(reordered_dataset_path)) {
-    RAFT_FAIL("ACE: Required file does not exist: %s", reordered_dataset_path.c_str());
+    RAFT_FAIL("ACE build: required file does not exist: %s", reordered_dataset_path.c_str());
   }
   if (!std::filesystem::exists(augmented_dataset_path)) {
-    RAFT_FAIL("ACE: Required file does not exist: %s", augmented_dataset_path.c_str());
+    RAFT_FAIL("ACE build: required file does not exist: %s", augmented_dataset_path.c_str());
   }
 
   size_t core_header_size      = 0;
@@ -748,7 +758,7 @@ void ace_load_partition_dataset_from_disk(
   core_file_offset += core_header_size;
   augmented_file_offset += augmented_header_size;
 
-  RAFT_LOG_DEBUG("ACE: Core file offset: %lu bytes, Augmented file offset: %lu bytes",
+  RAFT_LOG_DEBUG("ACE build: core file offset=%lu bytes augmented file offset=%lu bytes",
                  core_file_offset,
                  augmented_file_offset);
 
@@ -763,7 +773,7 @@ void ace_load_partition_dataset_from_disk(
       try {
         if (core_size > 0) {
           RAFT_LOG_DEBUG(
-            "ACE: Reading %lu core vectors from offset %lu", core_size, core_file_offset);
+            "ACE build: reading %lu core vectors from offset %lu", core_size, core_file_offset);
           cuvs::util::file_descriptor reordered_fd(reordered_dataset_path, O_RDONLY);
           const size_t core_bytes = core_size * vector_size;
           cuvs::util::read_large_file(
@@ -777,7 +787,7 @@ void ace_load_partition_dataset_from_disk(
     {
       try {
         if (augmented_size > 0) {
-          RAFT_LOG_DEBUG("ACE: Reading %lu augmented vectors from offset %lu",
+          RAFT_LOG_DEBUG("ACE build: reading %lu augmented vectors from offset %lu",
                          augmented_size,
                          augmented_file_offset);
           cuvs::util::file_descriptor augmented_fd(augmented_dataset_path, O_RDONLY);
@@ -879,7 +889,8 @@ bool ace_check_use_disk_mode(bool use_disk,
   // Use overridden memory limits if provided (> 0), otherwise query actual system memory
   if (max_host_memory_gb.has_value() && max_host_memory_gb.value() > 0) {
     mem.available_host_memory = static_cast<size_t>(max_host_memory_gb.value() * (1ULL << 30));
-    RAFT_LOG_INFO("ACE: Using overridden host memory limit: %.2f GiB", max_host_memory_gb.value());
+    RAFT_LOG_INFO("ACE build: using overridden host memory limit %.2f GiB",
+                  max_host_memory_gb.value());
   } else {
     mem.available_host_memory = cuvs::util::get_free_host_memory();
   }
@@ -903,7 +914,7 @@ bool ace_check_use_disk_mode(bool use_disk,
   mem.total_size       = mem.partition_labels_size + mem.id_mapping_size + mem.sub_dataset_size +
                    mem.sub_graph_size + mem.cagra_graph_size;
 
-  RAFT_LOG_INFO("ACE: Estimated host memory required: %.2f GiB, available: %.2f GiB",
+  RAFT_LOG_INFO("ACE build: host memory estimate required=%.2f GiB available=%.2f GiB",
                 to_gib(mem.total_size),
                 to_gib(mem.available_host_memory));
 
@@ -915,7 +926,8 @@ bool ace_check_use_disk_mode(bool use_disk,
   // TODO: Extend model or use managed memory if running out of GPU memory.
   if (max_gpu_memory_gb.has_value() && max_gpu_memory_gb.value() > 0) {
     mem.available_gpu_memory = static_cast<size_t>(max_gpu_memory_gb.value() * (1ULL << 30));
-    RAFT_LOG_INFO("ACE: Using overridden GPU memory limit: %.2f GiB", max_gpu_memory_gb.value());
+    RAFT_LOG_INFO("ACE build: using overridden GPU memory limit %.2f GiB",
+                  max_gpu_memory_gb.value());
   } else {
     mem.available_gpu_memory = rmm::available_device_memory().second;
   }
@@ -923,7 +935,7 @@ bool ace_check_use_disk_mode(bool use_disk,
     static_cast<size_t>(usable_gpu_memory_fraction * mem.available_gpu_memory) <
     std::max(mem.sub_graph_size, mem.sub_dataset_size);
 
-  RAFT_LOG_INFO("ACE: Estimated GPU memory required: %.2f GiB, available: %.2f GiB",
+  RAFT_LOG_INFO("ACE build: GPU memory estimate required=%.2f GiB available=%.2f GiB",
                 to_gib(mem.cagra_graph_size),
                 to_gib(mem.available_gpu_memory));
 
@@ -934,7 +946,7 @@ bool ace_check_use_disk_mode(bool use_disk,
     valid_build_dir &= build_dir.find('\0') == std::string::npos;
     valid_build_dir &= build_dir.find("//") == std::string::npos;
     if (!valid_build_dir) {
-      RAFT_LOG_WARN("ACE: Invalid build_dir path, resetting to default: /tmp/ace_build");
+      RAFT_LOG_WARN("ACE build: invalid build_dir path, resetting to /tmp/ace_build");
       build_dir = "/tmp/ace_build";
     }
     if (mkdir(build_dir.c_str(), 0755) != 0 && errno != EEXIST) {
@@ -943,24 +955,19 @@ bool ace_check_use_disk_mode(bool use_disk,
   }
 
   if (host_memory_limited && gpu_memory_limited) {
-    RAFT_LOG_INFO(
-      "ACE: Graph does not fit in host and GPU memory. Using disk-mode with temporary storage %s",
-      build_dir.c_str());
+    RAFT_LOG_INFO("ACE build: graph does not fit in host or GPU memory; using disk mode at %s",
+                  build_dir.c_str());
   } else if (host_memory_limited) {
-    RAFT_LOG_INFO(
-      "ACE: Graph does not fit in host memory. Using disk-mode with temporary storage %s",
-      build_dir.c_str());
+    RAFT_LOG_INFO("ACE build: graph does not fit in host memory; using disk mode at %s",
+                  build_dir.c_str());
   } else if (gpu_memory_limited) {
-    RAFT_LOG_INFO(
-      "ACE: Graph does not fit in GPU memory. Using disk-mode with temporary storage %s",
-      build_dir.c_str());
+    RAFT_LOG_INFO("ACE build: graph does not fit in GPU memory; using disk mode at %s",
+                  build_dir.c_str());
   } else if (use_disk) {
-    RAFT_LOG_INFO(
-      "ACE: Graph fits in host and GPU memory but disk mode is forced. Using disk-mode with "
-      "temporary storage %s",
-      build_dir.c_str());
+    RAFT_LOG_INFO("ACE build: graph fits in host and GPU memory but disk mode is forced; using %s",
+                  build_dir.c_str());
   } else {
-    RAFT_LOG_INFO("ACE: Graph fits in host and GPU memory. Using in-memory mode.");
+    RAFT_LOG_INFO("ACE build: graph fits in host and GPU memory; using in-memory mode");
   }
 
   return use_disk_mode;
@@ -1005,8 +1012,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
       disk_mode_host_required) {
     host_memory_insufficient = true;
     RAFT_LOG_WARN(
-      "ACE: Host memory insufficient for disk mode. Required: %.2f GiB, available: %.2f GiB. "
-      "Per-partition breakdown: dataset %.2f GiB, graph %.2f GiB, workspace %.2f GiB",
+      "ACE build: host memory is insufficient for disk mode: required=%.2f GiB available=%.2f "
+      "GiB (partition dataset=%.2f GiB graph=%.2f GiB workspace=%.2f GiB)",
       to_gib(disk_mode_host_required),
       to_gib(mem.available_host_memory),
       to_gib(mem.sub_dataset_size),
@@ -1017,8 +1024,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
     double available_for_scaling = usable_cpu_memory_fraction * mem.available_host_memory -
                                    mem.partition_labels_size - mem.id_mapping_size;
     RAFT_EXPECTS(available_for_scaling > 0,
-                 "ACE: Host memory insufficient even for constant overhead (labels + id_mapping). "
-                 "Required: %.2f GiB, available: %.2f GiB",
+                 "ACE build: host memory is insufficient even for constant overhead "
+                 "(labels + id_mapping): required=%.2f GiB available=%.2f GiB",
                  to_gib(mem.partition_labels_size + mem.id_mapping_size),
                  to_gib(usable_cpu_memory_fraction * mem.available_host_memory));
     host_suggested_partitions = static_cast<size_t>(
@@ -1040,9 +1047,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
       disk_mode_gpu_required) {
     gpu_memory_insufficient = true;
     RAFT_LOG_WARN(
-      "ACE: GPU memory insufficient for per-partition processing. Required: %.2f GiB, "
-      "available: %.2f GiB. Per-partition breakdown: dataset %.2f GiB, graph %.2f GiB, "
-      "workspace %.2f GiB",
+      "ACE build: GPU memory is insufficient for partition processing: required=%.2f GiB "
+      "available=%.2f GiB (partition dataset=%.2f GiB graph=%.2f GiB workspace=%.2f GiB)",
       to_gib(disk_mode_gpu_required),
       to_gib(mem.available_gpu_memory),
       to_gib(mem.sub_dataset_size),
@@ -1060,8 +1066,7 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
     size_t new_n_partitions = std::max(host_suggested_partitions, gpu_suggested_partitions);
 
     RAFT_LOG_WARN(
-      "ACE: Automatically increasing number of partitions from %zu to %zu to satisfy memory "
-      "constraints.%s%s",
+      "ACE build: increasing partitions from %zu to %zu to satisfy memory constraints.%s%s",
       original_n_partitions,
       new_n_partitions,
       host_memory_insufficient
@@ -1088,8 +1093,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions,
                                                                      guarantee_connectivity);
 
     RAFT_LOG_INFO(
-      "ACE: Updated per-partition memory estimates: dataset %.2f GiB, graph %.2f GiB, "
-      "host workspace %.2f GiB, GPU workspace %.2f GiB",
+      "ACE build: updated partition memory estimates dataset=%.2f GiB graph=%.2f GiB host "
+      "workspace=%.2f GiB GPU workspace=%.2f GiB",
       to_gib(mem.sub_dataset_size),
       to_gib(mem.sub_graph_size),
       to_gib(new_opt_host_ws),
@@ -1131,15 +1136,15 @@ index<T, IdxT> build_ace(raft::resources const& res,
   size_t dataset_size = dataset.extent(0);
   size_t dataset_dim  = dataset.extent(1);
 
-  RAFT_EXPECTS(dataset_size > 0, "ACE: Dataset must not be empty");
+  RAFT_EXPECTS(dataset_size > 0, "ACE build: dataset must not be empty");
   if (dataset_size < 1000) {
-    RAFT_LOG_WARN("ACE: Very small dataset size (%zu), consider using regular CAGRA build instead.",
+    RAFT_LOG_WARN("ACE build: very small dataset size (%zu); regular CAGRA may be simpler",
                   dataset_size);
   }
-  RAFT_EXPECTS(dataset_dim > 0, "ACE: Dataset dimension must be greater than 0");
+  RAFT_EXPECTS(dataset_dim > 0, "ACE build: dataset dimension must be greater than 0");
   RAFT_EXPECTS(params.intermediate_graph_degree > 0,
-               "ACE: Intermediate graph degree must be greater than 0");
-  RAFT_EXPECTS(params.graph_degree > 0, "ACE: Graph degree must be greater than 0");
+               "ACE build: intermediate graph degree must be greater than 0");
+  RAFT_EXPECTS(params.graph_degree > 0, "ACE build: graph degree must be greater than 0");
 
   size_t n_partitions = npartitions;
   if (n_partitions == 0) {
@@ -1152,17 +1157,22 @@ index<T, IdxT> build_ace(raft::resources const& res,
     n_partitions = dataset_size / min_required_per_partition;
     if (n_partitions < 2) {
       RAFT_LOG_WARN(
-        "ACE: Reduced number of partitions to the minimum of 2 to avoid tiny partitions. Consider "
-        "using regular CAGRA build instead.");
+        "ACE build: reduced partitions to the minimum of 2 to avoid tiny partitions; regular "
+        "CAGRA may be simpler");
       n_partitions = 2;
     } else {
-      RAFT_LOG_WARN("ACE: Reduced number of partitions to %zu to avoid tiny partitions",
-                    n_partitions);
+      RAFT_LOG_WARN("ACE build: reduced partitions to %zu to avoid tiny partitions", n_partitions);
     }
   }
 
   auto total_start = std::chrono::high_resolution_clock::now();
-  RAFT_LOG_INFO("ACE: Starting partitioned CAGRA build with %zu partitions", n_partitions);
+  RAFT_LOG_INFO(
+    "ACE build: start rows=%zu dim=%zu partitions=%zu graph_degree=%zu intermediate_degree=%zu",
+    dataset_size,
+    dataset_dim,
+    n_partitions,
+    static_cast<size_t>(params.graph_degree),
+    static_cast<size_t>(params.intermediate_graph_degree));
 
   size_t intermediate_degree = params.intermediate_graph_degree;
   size_t graph_degree        = params.graph_degree;
@@ -1228,7 +1238,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
         build_dir + "/cagra_graph.npy", {dataset_size, graph_degree});
 
       RAFT_LOG_DEBUG(
-        "ACE: Wrote numpy headers (reordered: %zu, augmented: %zu, mapping: %zu, graph: %zu bytes)",
+        "ACE build: wrote numpy headers (reordered=%zu augmented=%zu mapping=%zu graph=%zu bytes)",
         reordered_header_size,
         augmented_header_size,
         mapping_header_size,
@@ -1259,11 +1269,9 @@ index<T, IdxT> build_ace(raft::resources const& res,
     auto partition_elapsed =
       std::chrono::duration_cast<std::chrono::milliseconds>(partition_end - partition_start)
         .count();
-    RAFT_LOG_INFO(
-      "ACE: Partition labeling completed in %ld ms (min_partition_size: "
-      "%lu)",
-      partition_elapsed,
-      min_partition_size);
+    RAFT_LOG_INFO("ACE build: partition labeling completed in %ld ms (min_partition_size=%lu)",
+                  partition_elapsed,
+                  min_partition_size);
 
     // Create vector lists for each partition
     auto vectorlist_start      = std::chrono::high_resolution_clock::now();
@@ -1288,7 +1296,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
     auto vectorlist_elapsed =
       std::chrono::duration_cast<std::chrono::milliseconds>(vectorlist_end - vectorlist_start)
         .count();
-    RAFT_LOG_INFO("ACE: Vector list creation completed in %ld ms", vectorlist_elapsed);
+    RAFT_LOG_INFO("ACE build: partition mapping completed in %ld ms", vectorlist_elapsed);
 
     // Reorder the dataset based on partitions and store to disk. Uses write buffers to improve
     // performance.
@@ -1319,7 +1327,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
     // Process each partition
     auto partition_processing_start = std::chrono::high_resolution_clock::now();
     for (size_t partition_id = 0; partition_id < n_partitions; partition_id++) {
-      RAFT_LOG_DEBUG("ACE: Processing partition %lu/%lu", partition_id + 1, n_partitions);
+      RAFT_LOG_DEBUG("ACE build: processing partition %lu/%lu", partition_id + 1, n_partitions);
       auto start = std::chrono::high_resolution_clock::now();
 
       // Extract vectors for this partition
@@ -1328,10 +1336,11 @@ index<T, IdxT> build_ace(raft::resources const& res,
       size_t sub_dataset_size           = core_sub_dataset_size + augmented_sub_dataset_size;
 
       if (sub_dataset_size == 0) {
-        RAFT_LOG_WARN("ACE: Skipping empty partition %lu", partition_id);
+        RAFT_LOG_WARN("ACE build: Skipping empty partition %lu", partition_id);
         continue;
       }
-      RAFT_LOG_DEBUG("ACE: Sub-dataset size: %lu (%lu + %lu)",
+      RAFT_LOG_DEBUG("ACE build: partition %lu rows=%lu (core=%lu augmented=%lu)",
+                     partition_id,
                      sub_dataset_size,
                      core_sub_dataset_size,
                      augmented_sub_dataset_size);
@@ -1444,12 +1453,12 @@ index<T, IdxT> build_ace(raft::resources const& res,
           ? to_mib(core_sub_dataset_size * dataset_dim * sizeof(T)) / (write_elapsed / 1000.0)
           : 0.0;
       RAFT_LOG_INFO(
-        "ACE: Partition %4lu (%8lu + %8lu) completed in %6ld ms: read %6ld ms (%7.1f MiB/s), "
-        "optimize %6ld ms, adjust %6ld ms, write %6ld ms (%7.1f MiB/s)",
+        "ACE build: partition %4lu completed in %6ld ms (core=%8lu augmented=%8lu read=%6ld ms "
+        "%.1f MiB/s optimize=%6ld ms adjust=%6ld ms write=%6ld ms %.1f MiB/s)",
         partition_id,
+        elapsed_ms,
         core_sub_dataset_size,
         augmented_sub_dataset_size,
-        elapsed_ms,
         read_elapsed,
         read_throughput,
         optimize_elapsed,
@@ -1462,7 +1471,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
     auto partition_processing_elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
                                           partition_processing_end - partition_processing_start)
                                           .count();
-    RAFT_LOG_INFO("ACE: All partition processing completed in %ld ms (%zu partitions)",
+    RAFT_LOG_INFO("ACE build: partition graph build completed in %ld ms (partitions=%zu)",
                   partition_processing_elapsed,
                   n_partitions);
 
@@ -1472,7 +1481,7 @@ index<T, IdxT> build_ace(raft::resources const& res,
       const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy";
       if (std::filesystem::exists(augmented_dataset_path)) {
         std::filesystem::remove(augmented_dataset_path);
-        RAFT_LOG_INFO("ACE: Removed augmented dataset file to save disk space");
+        RAFT_LOG_INFO("ACE build: removed temporary augmented dataset");
       }
     }
 
@@ -1488,12 +1497,12 @@ index<T, IdxT> build_ace(raft::resources const& res,
           idx.update_dataset(res, dataset);
         } catch (std::bad_alloc& e) {
           RAFT_LOG_WARN(
-            "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be "
-            "stored.");
+            "ACE build: insufficient GPU memory to attach dataset to index; only the graph will "
+            "be stored");
         } catch (raft::logic_error& e) {
           RAFT_LOG_WARN(
-            "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be "
-            "stored.");
+            "ACE build: insufficient GPU memory to attach dataset to index; only the graph will "
+            "be stored");
         }
       }
     } else {
@@ -1501,37 +1510,36 @@ index<T, IdxT> build_ace(raft::resources const& res,
       idx.update_graph(res, std::move(graph_fd));
       idx.update_mapping(res, std::move(mapping_fd));
 
-      RAFT_LOG_INFO(
-        "ACE: Set disk storage at %s (dataset shape [%zu, %zu], graph shape [%zu, %zu])",
-        build_dir.c_str(),
-        idx.size(),
-        idx.dim(),
-        idx.size(),
-        idx.graph_degree());
+      RAFT_LOG_INFO("ACE build: disk artifacts ready at %s (dataset=[%zu,%zu] graph=[%zu,%zu])",
+                    build_dir.c_str(),
+                    idx.size(),
+                    idx.dim(),
+                    idx.size(),
+                    idx.graph_degree());
     }
 
     auto index_creation_end     = std::chrono::high_resolution_clock::now();
     auto index_creation_elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
                                     index_creation_end - index_creation_start)
                                     .count();
-    RAFT_LOG_INFO("ACE: Final index creation completed in %ld ms", index_creation_elapsed);
+    RAFT_LOG_INFO("ACE build: final index initialized in %ld ms", index_creation_elapsed);
 
     auto total_end = std::chrono::high_resolution_clock::now();
     auto total_elapsed =
       std::chrono::duration_cast<std::chrono::milliseconds>(total_end - total_start).count();
-    RAFT_LOG_INFO("ACE: Partitioned CAGRA build completed in %ld ms total", total_elapsed);
+    RAFT_LOG_INFO("ACE build: completed in %ld ms", total_elapsed);
 
     return idx;
   } catch (const std::exception& e) {
     // Clean up build directory on failure if we created it
-    RAFT_LOG_ERROR("ACE: Build failed with exception: %s", e.what());
+    RAFT_LOG_ERROR("ACE build: failed with exception: %s", e.what());
     if (cleanup_on_failure && !build_dir.empty()) {
-      RAFT_LOG_INFO("ACE: Cleaning up build directory: %s", build_dir.c_str());
+      RAFT_LOG_INFO("ACE build: cleaning up build directory %s", build_dir.c_str());
       try {
         std::filesystem::remove_all(build_dir);
-        RAFT_LOG_INFO("ACE: Successfully removed build directory");
+        RAFT_LOG_INFO("ACE build: removed build directory");
       } catch (const std::exception& cleanup_error) {
-        RAFT_LOG_WARN("ACE: Failed to clean up build directory: %s", cleanup_error.what());
+        RAFT_LOG_WARN("ACE build: failed to clean up build directory: %s", cleanup_error.what());
       }
     }
     // Re-throw the original exception
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
index 4914a0fa1b..f701ba962d 100644
--- a/cpp/src/neighbors/detail/hnsw.hpp
+++ b/cpp/src/neighbors/detail/hnsw.hpp
@@ -23,15 +23,28 @@
 #include <hnswlib/hnswalg.h>
 #include <hnswlib/hnswlib.h>
 
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
 #include <fcntl.h>
 #include <filesystem>
 #include <fstream>
+#include <limits>
 #include <memory>
+#include <mutex>
 #include <new>
+#include <numeric>
+#include <optional>
 #include <random>
-#include <sys/mman.h>
+#include <stdexcept>
 #include <thread>
+#include <type_traits>
 #include <unistd.h>
+#include <utility>
+#include <vector>
 
 namespace cuvs::neighbors::hnsw::detail {
 
@@ -166,6 +179,9 @@ struct index_impl : index<T> {
     RAFT_LOG_INFO("Loading HNSW index from disk: %s", filepath.c_str());
 
     try {
+      RAFT_EXPECTS(this->hierarchy() != HnswHierarchy::GPU_LAYERED_ON_DISK,
+                   "Layered HNSW indexes must be loaded with hnsw::deserialize so a local dataset "
+                   "can be provided through index_params.dataset_path.");
       appr_alg_ = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
         space_.get(), filepath);
       if (this->hierarchy() == HnswHierarchy::NONE) { appr_alg_->base_layer_only = true; }
@@ -341,6 +357,861 @@ void all_neighbors_graph(raft::resources const& res,
   }
 }
 
+struct hnsw_level_plan {
+  size_t n_rows = 0;
+  std::vector<size_t> hist;
+  // Bucket-end offsets after order construction. Level L starts at offsets[L - 1].
+  std::vector<size_t> offsets;
+  std::vector<size_t> order;
+  std::vector<size_t> order_bw;
+  std::vector<uint8_t> levels;
+
+  [[nodiscard]] auto max_level() const -> int
+  {
+    return hist.empty() ? 0 : static_cast<int>(hist.size() - 1);
+  }
+
+  [[nodiscard]] auto promoted_count() const -> size_t
+  {
+    return hist.empty() ? 0 : n_rows - hist[0];
+  }
+};
+
+inline auto hnsw_upper_layer_degree(size_t row_count, size_t M) -> size_t
+{
+  return row_count > M ? M : row_count > 0 ? row_count - 1 : 0;
+}
+
+inline auto elapsed_ms_since(std::chrono::steady_clock::time_point start) -> int64_t
+{
+  return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() -
+                                                               start)
+    .count();
+}
+
+inline auto elapsed_ms(std::chrono::steady_clock::duration elapsed) -> double
+{
+  return std::chrono::duration<double, std::milli>(elapsed).count();
+}
+
+inline auto to_gib(size_t bytes) -> double
+{
+  return static_cast<double>(bytes) / (1024.0 * 1024.0 * 1024.0);
+}
+
+inline auto throughput_gib_per_s(size_t bytes, int64_t elapsed_ms) -> double
+{
+  return elapsed_ms > 0 ? to_gib(bytes) / (elapsed_ms / 1000.0) : 0.0;
+}
+
+inline auto throughput_gib_per_s(size_t bytes, std::chrono::steady_clock::duration elapsed)
+  -> double
+{
+  const auto elapsed_s = std::chrono::duration<double>(elapsed).count();
+  return elapsed_s > 0.0 ? to_gib(bytes) / elapsed_s : 0.0;
+}
+
+inline auto progress_step_10(size_t completed, size_t total) -> size_t
+{
+  if (total == 0) { return 100; }
+  const auto percent = (std::min(completed, total) * 100) / total;
+  return completed >= total ? 100 : (percent / 10) * 10;
+}
+
+inline auto make_hnsw_level_plan_from_levels(size_t n_rows,
+                                             std::vector<uint8_t>&& levels,
+                                             bool build_reverse_order,
+                                             const char* log_prefix = nullptr) -> hnsw_level_plan
+{
+  hnsw_level_plan plan;
+  plan.n_rows = n_rows;
+  plan.levels = std::move(levels);
+  RAFT_EXPECTS(n_rows > 0, "HNSW hierarchy requires at least one row");
+  RAFT_EXPECTS(plan.levels.size() == n_rows,
+               "HNSW level count (%zu) must match row count (%zu)",
+               plan.levels.size(),
+               n_rows);
+
+  for (auto level : plan.levels) {
+    while (static_cast<size_t>(level) >= plan.hist.size()) {
+      plan.hist.push_back(0);
+    }
+    plan.hist[level]++;
+  }
+
+  plan.offsets.resize(plan.hist.size() + 1, 0);
+  for (size_t i = 0; i < plan.hist.size() - 1; ++i) {
+    plan.offsets[i + 1] = plan.offsets[i] + plan.hist[i];
+    if (log_prefix != nullptr) {
+      RAFT_LOG_INFO("%s%zu : %zu", log_prefix, i + 1, n_rows - plan.offsets[i + 1]);
+    }
+  }
+
+  plan.order.resize(n_rows);
+  if (build_reverse_order) { plan.order_bw.resize(n_rows); }
+  for (size_t i = 0; i < n_rows; ++i) {
+    const auto level = static_cast<size_t>(plan.levels[i]);
+    if (build_reverse_order) { plan.order_bw[i] = plan.offsets[level]; }
+    plan.order[plan.offsets[level]++] = i;
+  }
+
+  return plan;
+}
+
+template <typename HnswAlgo>
+auto make_random_hnsw_level_plan(size_t n_rows, HnswAlgo& appr_algo, const char* log_prefix)
+  -> hnsw_level_plan
+{
+  std::vector<uint8_t> levels(n_rows);
+  for (int64_t i = 0; i < static_cast<int64_t>(n_rows); ++i) {
+    const auto pt_level = appr_algo.getRandomLevel(appr_algo.mult_);
+    RAFT_EXPECTS(pt_level <= std::numeric_limits<uint8_t>::max(),
+                 "HNSW serialization only supports levels up to %u",
+                 static_cast<unsigned int>(std::numeric_limits<uint8_t>::max()));
+    levels[i] = static_cast<uint8_t>(pt_level);
+  }
+
+  return make_hnsw_level_plan_from_levels(n_rows, std::move(levels), true, log_prefix);
+}
+
+template <typename T, typename IdxT, typename Callback>
+void build_hnsw_upper_layer_graphs(
+  raft::resources const& res,
+  raft::host_matrix_view<const T, int64_t, raft::row_major> promoted_dataset,
+  const hnsw_level_plan& plan,
+  size_t M,
+  cuvs::distance::DistanceType metric,
+  Callback&& callback)
+{
+  static_assert(std::is_same_v<IdxT, uint32_t>,
+                "HNSW upper-layer graph construction expects uint32_t neighbor ids");
+
+  const auto dim = static_cast<size_t>(promoted_dataset.extent(1));
+  for (size_t pt_level = 1; pt_level < plan.hist.size(); ++pt_level) {
+    const auto start_idx     = plan.offsets[pt_level - 1];
+    const auto removed_rows  = start_idx - plan.offsets[0];
+    const auto row_count     = plan.n_rows - start_idx;
+    const auto neighbor_size = hnsw_upper_layer_degree(row_count, M);
+
+    RAFT_LOG_INFO("Layered HNSW artifact: computing upper-layer neighbors level=%zu", pt_level);
+    auto host_neighbors = raft::make_host_matrix<IdxT, int64_t>(
+      static_cast<int64_t>(row_count), static_cast<int64_t>(neighbor_size));
+
+    if (neighbor_size > 0) {
+      auto layer_dataset_view = raft::make_host_matrix_view<const T, int64_t, raft::row_major>(
+        promoted_dataset.data_handle() + removed_rows * dim,
+        static_cast<int64_t>(row_count),
+        promoted_dataset.extent(1));
+      all_neighbors_graph(res, layer_dataset_view, host_neighbors.view(), metric);
+    }
+
+    callback(pt_level, start_idx, host_neighbors);
+  }
+}
+
+struct layered_hnsw_layer_info {
+  size_t level       = 0;
+  size_t row_count   = 0;
+  size_t degree      = 0;
+  size_t node_offset = 0;
+  size_t link_offset = 0;
+};
+
+struct layered_hnsw_file_metadata {
+  size_t n_rows               = 0;
+  size_t dim                  = 0;
+  size_t M                    = 0;
+  size_t maxM                 = 0;
+  size_t maxM0                = 0;
+  size_t ef_construction      = 0;
+  size_t base_degree          = 0;
+  size_t levels_bytes         = 0;
+  size_t base_nodes_bytes     = 0;
+  size_t base_link_row_bytes  = 0;
+  size_t base_links_bytes     = 0;
+  size_t upper_nodes_count    = 0;
+  size_t upper_nodes_bytes    = 0;
+  size_t upper_link_row_bytes = 0;
+  size_t upper_links_bytes    = 0;
+  double mult                 = 0.0;
+  int maxlevel                = 0;
+  int enterpoint_node         = 0;
+  std::vector<layered_hnsw_layer_info> layers;
+};
+
+enum class layered_hnsw_dtype : uint32_t {
+  unknown = 0,
+  float32 = 1,
+  float16 = 2,
+  uint8   = 3,
+  int8    = 4,
+};
+
+// The layered HNSW artifact begins with this fixed-size POD header, immediately followed by
+// `num_layers` layered_hnsw_layer_descriptor records and then the payload sections.
+struct layered_hnsw_file_header {
+  char magic[32];
+  uint64_t n_rows;
+  uint64_t dim;
+  uint64_t M;
+  uint64_t maxM;
+  uint64_t maxM0;
+  uint64_t ef_construction;
+  uint64_t base_degree;
+  uint64_t levels_bytes;
+  uint64_t base_nodes_bytes;
+  uint64_t base_link_row_bytes;
+  uint64_t base_links_bytes;
+  uint64_t upper_nodes_count;
+  uint64_t upper_nodes_bytes;
+  uint64_t upper_link_row_bytes;
+  uint64_t upper_links_bytes;
+  double mult;
+  uint32_t version;
+  uint32_t dtype;
+  uint32_t metric;
+  uint32_t num_layers;
+  int32_t maxlevel;
+  int32_t enterpoint_node;
+  uint32_t reserved0;
+  uint32_t reserved1;
+};
+static_assert(sizeof(layered_hnsw_file_header) == 192,
+              "layered_hnsw_file_header must keep a fixed 192-byte on-disk layout");
+
+struct layered_hnsw_layer_descriptor {
+  uint64_t level;
+  uint64_t row_count;
+  uint64_t degree;
+  uint64_t node_offset;
+  uint64_t link_offset;
+};
+static_assert(sizeof(layered_hnsw_layer_descriptor) == 40,
+              "layered_hnsw_layer_descriptor must keep a fixed 40-byte on-disk layout");
+
+constexpr const char* layered_hnsw_magic = "CUVS_HNSW_LAYERED";
+constexpr uint32_t layered_hnsw_version  = 1;
+constexpr size_t layered_hnsw_alignment  = 64;
+
+inline auto align_up(size_t value, size_t alignment) -> size_t
+{
+  return ((value + alignment - 1) / alignment) * alignment;
+}
+
+template <typename T>
+constexpr auto layered_dtype_code() -> layered_hnsw_dtype
+{
+  if constexpr (std::is_same_v<T, float>) {
+    return layered_hnsw_dtype::float32;
+  } else if constexpr (std::is_same_v<T, half>) {
+    return layered_hnsw_dtype::float16;
+  } else if constexpr (std::is_same_v<T, uint8_t>) {
+    return layered_hnsw_dtype::uint8;
+  } else if constexpr (std::is_same_v<T, int8_t>) {
+    return layered_hnsw_dtype::int8;
+  } else {
+    return layered_hnsw_dtype::unknown;
+  }
+}
+
+inline auto layered_dtype_name(layered_hnsw_dtype dtype) -> const char*
+{
+  switch (dtype) {
+    case layered_hnsw_dtype::float32: return "float32";
+    case layered_hnsw_dtype::float16: return "float16";
+    case layered_hnsw_dtype::uint8: return "uint8";
+    case layered_hnsw_dtype::int8: return "int8";
+    default: return "unknown";
+  }
+}
+
+inline auto metric_name(cuvs::distance::DistanceType metric) -> const char*
+{
+  switch (metric) {
+    case cuvs::distance::DistanceType::L2Expanded: return "L2Expanded";
+    case cuvs::distance::DistanceType::InnerProduct: return "InnerProduct";
+    default: return "Unknown";
+  }
+}
+
+template <typename T>
+auto make_layered_hnsw_header(const layered_hnsw_file_metadata& metadata,
+                              cuvs::distance::DistanceType metric) -> layered_hnsw_file_header
+{
+  layered_hnsw_file_header header{};
+  std::strncpy(header.magic, layered_hnsw_magic, sizeof(header.magic) - 1);
+  header.version              = layered_hnsw_version;
+  header.dtype                = static_cast<uint32_t>(layered_dtype_code<T>());
+  header.metric               = static_cast<uint32_t>(metric);
+  header.num_layers           = static_cast<uint32_t>(metadata.layers.size());
+  header.n_rows               = metadata.n_rows;
+  header.dim                  = metadata.dim;
+  header.M                    = metadata.M;
+  header.maxM                 = metadata.maxM;
+  header.maxM0                = metadata.maxM0;
+  header.ef_construction      = metadata.ef_construction;
+  header.base_degree          = metadata.base_degree;
+  header.levels_bytes         = metadata.levels_bytes;
+  header.base_nodes_bytes     = metadata.base_nodes_bytes;
+  header.base_link_row_bytes  = metadata.base_link_row_bytes;
+  header.base_links_bytes     = metadata.base_links_bytes;
+  header.upper_nodes_count    = metadata.upper_nodes_count;
+  header.upper_nodes_bytes    = metadata.upper_nodes_bytes;
+  header.upper_link_row_bytes = metadata.upper_link_row_bytes;
+  header.upper_links_bytes    = metadata.upper_links_bytes;
+  header.mult                 = metadata.mult;
+  header.maxlevel             = metadata.maxlevel;
+  header.enterpoint_node      = metadata.enterpoint_node;
+  return header;
+}
+
+inline auto layered_hnsw_metadata_from_header(const layered_hnsw_file_header& header)
+  -> layered_hnsw_file_metadata
+{
+  layered_hnsw_file_metadata metadata;
+  metadata.n_rows               = header.n_rows;
+  metadata.dim                  = header.dim;
+  metadata.M                    = header.M;
+  metadata.maxM                 = header.maxM;
+  metadata.maxM0                = header.maxM0;
+  metadata.ef_construction      = header.ef_construction;
+  metadata.base_degree          = header.base_degree;
+  metadata.levels_bytes         = header.levels_bytes;
+  metadata.base_nodes_bytes     = header.base_nodes_bytes;
+  metadata.base_link_row_bytes  = header.base_link_row_bytes;
+  metadata.base_links_bytes     = header.base_links_bytes;
+  metadata.upper_nodes_count    = header.upper_nodes_count;
+  metadata.upper_nodes_bytes    = header.upper_nodes_bytes;
+  metadata.upper_link_row_bytes = header.upper_link_row_bytes;
+  metadata.upper_links_bytes    = header.upper_links_bytes;
+  metadata.mult                 = header.mult;
+  metadata.maxlevel             = header.maxlevel;
+  metadata.enterpoint_node      = header.enterpoint_node;
+  return metadata;
+}
+
+struct npy_file {
+  cuvs::util::file_descriptor fd;
+  size_t header_size = 0;
+  std::vector<size_t> shape;
+};
+
+inline auto open_npy_file(const std::string& path) -> npy_file
+{
+  std::ifstream stream(path, std::ios::binary);
+  RAFT_EXPECTS(stream.good(), "Failed to open numpy file: %s", path.c_str());
+  auto header      = raft::detail::numpy_serializer::read_header(stream);
+  auto header_size = static_cast<size_t>(stream.tellg());
+  auto fd          = cuvs::util::file_descriptor(path, O_RDONLY);
+  return {std::move(fd), header_size, header.shape};
+}
+
+inline auto ends_with(const std::string& value, const std::string& suffix) -> bool
+{
+  return value.size() >= suffix.size() &&
+         value.compare(value.size() - suffix.size(), suffix.size(), suffix) == 0;
+}
+
+inline void copy_file_overwrite(const std::filesystem::path& src, const std::filesystem::path& dst)
+{
+  std::error_code ec;
+  if (src == dst ||
+      (std::filesystem::exists(dst, ec) && std::filesystem::equivalent(src, dst, ec))) {
+    return;
+  }
+  if (!dst.parent_path().empty()) { std::filesystem::create_directories(dst.parent_path()); }
+  std::filesystem::copy_file(src, dst, std::filesystem::copy_options::overwrite_existing);
+}
+
+template <typename T>
+inline auto open_layered_dataset_file(const std::string& path) -> npy_file
+{
+  if (std::filesystem::path(path).extension() == ".npy") { return open_npy_file(path); }
+
+  std::ifstream stream(path, std::ios::binary);
+  RAFT_EXPECTS(stream.good(), "Failed to open dataset file: %s", path.c_str());
+  std::array<uint32_t, 2> header{};
+  stream.read(reinterpret_cast<char*>(header.data()), sizeof(uint32_t) * header.size());
+  RAFT_EXPECTS(stream.gcount() == static_cast<std::streamsize>(sizeof(uint32_t) * header.size()),
+               "Failed to read ANN benchmark binary dataset header: %s",
+               path.c_str());
+
+  const auto ext = std::filesystem::path(path).extension().string();
+  if constexpr (std::is_same_v<T, float>) {
+    RAFT_EXPECTS(ext == ".fbin" && !ends_with(path, ".fp16.fbin"),
+                 "Expected a .fbin dataset for float layered HNSW load: %s",
+                 path.c_str());
+  } else if constexpr (std::is_same_v<T, half>) {
+    RAFT_EXPECTS(ext == ".f16bin" || ends_with(path, ".fp16.fbin"),
+                 "Expected a .f16bin or .fp16.fbin dataset for half layered HNSW load: %s",
+                 path.c_str());
+  } else if constexpr (std::is_same_v<T, uint8_t>) {
+    RAFT_EXPECTS(
+      ext == ".u8bin", "Expected a .u8bin dataset for uint8 layered HNSW load: %s", path.c_str());
+  } else if constexpr (std::is_same_v<T, int8_t>) {
+    RAFT_EXPECTS(
+      ext == ".i8bin", "Expected a .i8bin dataset for int8 layered HNSW load: %s", path.c_str());
+  }
+
+  auto fd = cuvs::util::file_descriptor(path, O_RDONLY);
+  return {std::move(fd), sizeof(uint32_t) * header.size(), {header[0], header[1]}};
+}
+
+template <typename T, typename IdxT>
+void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index<T, IdxT>& index_,
+                                        const cuvs::util::file_descriptor& output_fd,
+                                        size_t base_nodes_offset,
+                                        size_t base_links_offset,
+                                        size_t base_link_row_bytes,
+                                        size_t maxM0)
+{
+  static_assert(std::is_same_v<IdxT, uint32_t>,
+                "Layered HNSW artifacts store topology ids as uint32_t");
+  const auto& graph_fd_opt   = index_.graph_fd();
+  const auto& mapping_fd_opt = index_.mapping_fd();
+  RAFT_EXPECTS(graph_fd_opt.has_value() && graph_fd_opt->is_valid(),
+               "Graph file descriptor is not available");
+  RAFT_EXPECTS(mapping_fd_opt.has_value() && mapping_fd_opt->is_valid(),
+               "Mapping file descriptor is not available");
+
+  const auto graph_path   = graph_fd_opt->get_path();
+  const auto mapping_path = mapping_fd_opt->get_path();
+  RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor");
+  RAFT_EXPECTS(!mapping_path.empty(), "Unable to get path from mapping file descriptor");
+
+  auto graph_npy   = open_npy_file(graph_path);
+  auto mapping_npy = open_npy_file(mapping_path);
+
+  RAFT_EXPECTS(graph_npy.shape.size() == 2, "Graph file should be 2D");
+  RAFT_EXPECTS(mapping_npy.shape.size() == 1, "Mapping file should be 1D");
+  const auto n_rows = graph_npy.shape[0];
+  const auto degree = graph_npy.shape[1];
+  RAFT_EXPECTS(mapping_npy.shape[0] == n_rows,
+               "Mapping elements (%zu) != graph rows (%zu)",
+               mapping_npy.shape[0],
+               n_rows);
+  RAFT_EXPECTS(n_rows == static_cast<size_t>(index_.size()),
+               "Graph rows (%zu) != index size (%zu)",
+               n_rows,
+               static_cast<size_t>(index_.size()));
+  RAFT_EXPECTS(degree == static_cast<size_t>(index_.graph_degree()),
+               "Graph degree (%zu) != index graph degree (%zu)",
+               degree,
+               static_cast<size_t>(index_.graph_degree()));
+  RAFT_EXPECTS(degree > 0, "Graph degree must be nonzero");
+  RAFT_EXPECTS(
+    degree <= maxM0, "Base graph degree (%zu) must not exceed HNSW maxM0 (%zu)", degree, maxM0);
+  RAFT_EXPECTS(base_link_row_bytes >= sizeof(hnswlib::linklistsizeint) + maxM0 * sizeof(IdxT),
+               "Base link row size is too small");
+
+  RAFT_LOG_INFO("Layered HNSW artifact: loading ACE row mapping (%zu rows)", n_rows);
+  const auto mapping_start_time = std::chrono::steady_clock::now();
+  const auto mapping_bytes      = n_rows * sizeof(IdxT);
+  std::vector<IdxT> reordered_to_original(n_rows);
+  cuvs::util::read_large_file(
+    mapping_npy.fd, reordered_to_original.data(), mapping_bytes, mapping_npy.header_size);
+  for (size_t reordered_id = 0; reordered_id < n_rows; ++reordered_id) {
+    const auto original_id = static_cast<size_t>(reordered_to_original[reordered_id]);
+    RAFT_EXPECTS(original_id < n_rows,
+                 "Invalid original id %zu in ACE dataset mapping at row %zu",
+                 original_id,
+                 reordered_id);
+  }
+  const auto mapping_elapsed_ms = elapsed_ms_since(mapping_start_time);
+  RAFT_LOG_INFO("Layered HNSW artifact: ACE row mapping loaded in %ld ms (%.2f GiB, %.2f GiB/s)",
+                mapping_elapsed_ms,
+                to_gib(mapping_bytes),
+                throughput_gib_per_s(mapping_bytes, mapping_elapsed_ms));
+
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: writing base topology with source-sequential graph reads "
+    "(rows=%zu degree=%zu maxM0=%zu)",
+    n_rows,
+    degree,
+    maxM0);
+
+  const auto total_start_time       = std::chrono::steady_clock::now();
+  const auto graph_row_bytes        = degree * sizeof(IdxT);
+  const auto base_node_row_bytes    = sizeof(IdxT);
+  const auto base_topology_row_size = graph_row_bytes + base_node_row_bytes + base_link_row_bytes;
+  const size_t target_batch_bytes   = 64 * 1024 * 1024;
+  const size_t batch_size = std::max<size_t>(1, target_batch_bytes / base_topology_row_size);
+  auto graph_buffer       = raft::make_host_matrix<IdxT, int64_t>(static_cast<int64_t>(batch_size),
+                                                            static_cast<int64_t>(degree));
+  std::vector<IdxT> base_node_buffer(batch_size);
+  std::vector<char> base_link_buffer(batch_size * base_link_row_bytes);
+
+  size_t graph_bytes_read   = 0;
+  size_t node_bytes_written = 0;
+  size_t link_bytes_written = 0;
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: base topology batch_size=%zu graph_row=%zu bytes node_row=%zu bytes "
+    "link_row=%zu bytes",
+    batch_size,
+    graph_row_bytes,
+    base_node_row_bytes,
+    base_link_row_bytes);
+  size_t next_report_percent = 10;
+  for (size_t source_start = 0; source_start < n_rows; source_start += batch_size) {
+    const auto current_batch_size = std::min(batch_size, n_rows - source_start);
+    const auto batch_bytes        = current_batch_size * graph_row_bytes;
+    cuvs::util::read_large_file(graph_npy.fd,
+                                graph_buffer.data_handle(),
+                                batch_bytes,
+                                graph_npy.header_size + source_start * graph_row_bytes);
+    graph_bytes_read += batch_bytes;
+
+    // Rows stay in source (ACE-reordered) order so the artifact is written sequentially on the
+    // build node. Each row records its original ID in base_nodes and remaps its neighbors to
+    // original IDs. The search node scatters each row into get_linklist0(original_id) on load.
+    bool invalid_neighbor = false;
+#pragma omp parallel for reduction(|| : invalid_neighbor)
+    for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size); ++batch_idx) {
+      const auto source_row       = source_start + static_cast<size_t>(batch_idx);
+      const auto original_id      = static_cast<size_t>(reordered_to_original[source_row]);
+      base_node_buffer[batch_idx] = static_cast<IdxT>(original_id);
+      auto* dst_row_ptr           = base_link_buffer.data() + batch_idx * base_link_row_bytes;
+      hnswlib::linklistsizeint list_count = static_cast<hnswlib::linklistsizeint>(degree);
+      std::memcpy(dst_row_ptr, &list_count, sizeof(list_count));
+      auto* dst = reinterpret_cast<IdxT*>(dst_row_ptr + sizeof(hnswlib::linklistsizeint));
+      auto* src = graph_buffer.data_handle() + batch_idx * degree;
+      for (size_t j = 0; j < degree; ++j) {
+        const auto neighbor = static_cast<size_t>(src[j]);
+        if (neighbor >= n_rows) {
+          invalid_neighbor = true;
+          continue;
+        }
+        dst[j] = reordered_to_original[neighbor];
+      }
+      const auto written_bytes = sizeof(hnswlib::linklistsizeint) + degree * sizeof(IdxT);
+      if (written_bytes < base_link_row_bytes) {
+        std::memset(dst_row_ptr + written_bytes, 0, base_link_row_bytes - written_bytes);
+      }
+    }
+    RAFT_EXPECTS(!invalid_neighbor, "Invalid reordered neighbor id in ACE graph");
+
+    const auto current_node_bytes = current_batch_size * sizeof(IdxT);
+    const auto current_link_bytes = current_batch_size * base_link_row_bytes;
+    cuvs::util::write_large_file(output_fd,
+                                 base_node_buffer.data(),
+                                 current_node_bytes,
+                                 base_nodes_offset + source_start * sizeof(IdxT));
+    cuvs::util::write_large_file(output_fd,
+                                 base_link_buffer.data(),
+                                 current_link_bytes,
+                                 base_links_offset + source_start * base_link_row_bytes);
+    node_bytes_written += current_node_bytes;
+    link_bytes_written += current_link_bytes;
+
+    const auto completed_rows = source_start + current_batch_size;
+    const auto report_percent = progress_step_10(completed_rows, n_rows);
+    if (report_percent >= next_report_percent) {
+      const auto elapsed = elapsed_ms_since(total_start_time);
+      RAFT_LOG_INFO(
+        "Layered HNSW artifact: base topology progress %zu%% (%zu/%zu rows, read=%.2f GiB "
+        "write=%.2f GiB, %ld ms)",
+        report_percent,
+        completed_rows,
+        n_rows,
+        to_gib(graph_bytes_read),
+        to_gib(node_bytes_written + link_bytes_written),
+        elapsed);
+      next_report_percent = report_percent + 10;
+    }
+  }
+
+  const auto total_elapsed_ms = elapsed_ms_since(total_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: base topology written in %ld ms (graph_read=%.2f GiB %.2f GiB/s, "
+    "artifact_write=%.2f GiB %.2f GiB/s, nodes=%.2f GiB links=%.2f GiB)",
+    total_elapsed_ms,
+    to_gib(graph_bytes_read),
+    throughput_gib_per_s(graph_bytes_read, total_elapsed_ms),
+    to_gib(node_bytes_written + link_bytes_written),
+    throughput_gib_per_s(node_bytes_written + link_bytes_written, total_elapsed_ms),
+    to_gib(node_bytes_written),
+    to_gib(link_bytes_written));
+}
+
+template <typename T, typename IdxT>
+auto serialize_to_layered_hnsw_from_disk(
+  raft::resources const& res,
+  const cuvs::neighbors::hnsw::index_params& params,
+  const cuvs::neighbors::cagra::index<T, IdxT>& index_,
+  raft::host_matrix_view<const T, int64_t, raft::row_major> dataset) -> std::string
+{
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("hnsw::serialize_layered");
+  const auto total_start_time = std::chrono::steady_clock::now();
+
+  RAFT_EXPECTS(index_.graph_fd().has_value() && index_.mapping_fd().has_value(),
+               "Layered HNSW serialization requires a disk-backed ACE graph and mapping.");
+  RAFT_EXPECTS(static_cast<size_t>(dataset.extent(0)) == static_cast<size_t>(index_.size()),
+               "Dataset rows (%zu) must match index size (%zu)",
+               static_cast<size_t>(dataset.extent(0)),
+               static_cast<size_t>(index_.size()));
+  RAFT_EXPECTS(static_cast<size_t>(dataset.extent(1)) == static_cast<size_t>(index_.dim()),
+               "Dataset cols (%zu) must match index dimensions (%zu)",
+               static_cast<size_t>(dataset.extent(1)),
+               static_cast<size_t>(index_.dim()));
+
+  const auto graph_path = index_.graph_fd()->get_path();
+  RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor");
+  const auto ace_dir       = std::filesystem::path(graph_path).parent_path();
+  const auto artifact_file = ace_dir / "hnsw_index.cuvs";
+  std::filesystem::create_directories(ace_dir);
+
+  auto n_rows           = static_cast<size_t>(index_.size());
+  auto dim              = static_cast<size_t>(index_.dim());
+  auto graph_degree_int = static_cast<int>(index_.graph_degree());
+
+  auto hnsw_index =
+    std::make_unique<index_impl<T>>(index_.dim(), index_.metric(), params.hierarchy);
+  auto appr_algo = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
+    hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction);
+
+  RAFT_LOG_INFO("Layered HNSW artifact: generating hierarchy levels");
+  const auto hierarchy_start_time = std::chrono::steady_clock::now();
+  auto hierarchy                  = make_random_hnsw_level_plan(n_rows, *appr_algo, nullptr);
+  const auto hierarchy_elapsed_ms = elapsed_ms_since(hierarchy_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: hierarchy levels generated in %ld ms (max_level=%d promoted=%zu)",
+    hierarchy_elapsed_ms,
+    hierarchy.max_level(),
+    hierarchy.promoted_count());
+
+  layered_hnsw_file_metadata metadata;
+  metadata.n_rows               = n_rows;
+  metadata.dim                  = dim;
+  metadata.M                    = appr_algo->M_;
+  metadata.maxM                 = appr_algo->maxM_;
+  metadata.maxM0                = appr_algo->maxM0_;
+  metadata.ef_construction      = appr_algo->ef_construction_;
+  metadata.mult                 = appr_algo->mult_;
+  metadata.maxlevel             = hierarchy.max_level();
+  metadata.enterpoint_node      = static_cast<int>(hierarchy.order.back());
+  metadata.base_degree          = static_cast<size_t>(graph_degree_int);
+  metadata.levels_bytes         = n_rows * sizeof(uint8_t);
+  metadata.base_nodes_bytes     = n_rows * sizeof(IdxT);
+  metadata.base_link_row_bytes  = appr_algo->size_links_level0_;
+  metadata.base_links_bytes     = n_rows * metadata.base_link_row_bytes;
+  metadata.upper_link_row_bytes = appr_algo->size_links_per_element_;
+  size_t next_upper_node_offset = 0;
+  size_t next_upper_link_offset = 0;
+  for (size_t pt_level = 1; pt_level < hierarchy.hist.size(); pt_level++) {
+    auto start_idx    = hierarchy.offsets[pt_level - 1];
+    auto row_count    = n_rows - start_idx;
+    auto layer_degree = hnsw_upper_layer_degree(row_count, appr_algo->M_);
+    metadata.layers.push_back(
+      {pt_level, row_count, layer_degree, next_upper_node_offset, next_upper_link_offset});
+    next_upper_node_offset += row_count;
+    next_upper_link_offset += row_count;
+  }
+  metadata.upper_nodes_count = next_upper_node_offset;
+  metadata.upper_nodes_bytes = metadata.upper_nodes_count * sizeof(IdxT);
+  metadata.upper_links_bytes = next_upper_link_offset * metadata.upper_link_row_bytes;
+
+  const auto header = make_layered_hnsw_header<T>(metadata, index_.metric());
+  std::vector<layered_hnsw_layer_descriptor> layer_descriptors(metadata.layers.size());
+  for (size_t i = 0; i < metadata.layers.size(); ++i) {
+    const auto& layer    = metadata.layers[i];
+    layer_descriptors[i] = {static_cast<uint64_t>(layer.level),
+                            static_cast<uint64_t>(layer.row_count),
+                            static_cast<uint64_t>(layer.degree),
+                            static_cast<uint64_t>(layer.node_offset),
+                            static_cast<uint64_t>(layer.link_offset)};
+  }
+  const auto descriptors_offset = sizeof(layered_hnsw_file_header);
+  const auto descriptors_bytes  = layer_descriptors.size() * sizeof(layered_hnsw_layer_descriptor);
+
+  const auto payload_offset =
+    align_up(descriptors_offset + descriptors_bytes, layered_hnsw_alignment);
+  const auto levels_offset      = payload_offset;
+  const auto base_nodes_offset  = levels_offset + metadata.levels_bytes;
+  const auto base_links_offset  = base_nodes_offset + metadata.base_nodes_bytes;
+  const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes;
+  const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes;
+  const auto final_file_size    = upper_links_offset + metadata.upper_links_bytes;
+
+  cuvs::util::file_descriptor artifact_fd(artifact_file.string(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+  cuvs::util::preallocate_file(artifact_fd, final_file_size);
+  cuvs::util::write_large_file(artifact_fd, &header, sizeof(header), 0);
+  if (descriptors_bytes > 0) {
+    cuvs::util::write_large_file(
+      artifact_fd, layer_descriptors.data(), descriptors_bytes, descriptors_offset);
+  }
+
+  const auto levels_start_time = std::chrono::steady_clock::now();
+  cuvs::util::write_large_file(
+    artifact_fd, hierarchy.levels.data(), metadata.levels_bytes, levels_offset);
+  const auto levels_elapsed_ms = elapsed_ms_since(levels_start_time);
+  RAFT_LOG_INFO("Layered HNSW artifact: levels section written in %ld ms (%.2f GiB, %.2f GiB/s)",
+                levels_elapsed_ms,
+                to_gib(metadata.levels_bytes),
+                throughput_gib_per_s(metadata.levels_bytes, levels_elapsed_ms));
+
+  RAFT_LOG_INFO("Layered HNSW artifact: writing hnswlib-ready base topology section");
+  const auto layer0_start_time = std::chrono::steady_clock::now();
+  write_layered_base_links_from_disk<T, IdxT>(index_,
+                                              artifact_fd,
+                                              base_nodes_offset,
+                                              base_links_offset,
+                                              metadata.base_link_row_bytes,
+                                              metadata.maxM0);
+  const auto layer0_elapsed_ms = elapsed_ms_since(layer0_start_time);
+  static_cast<void>(layer0_elapsed_ms);
+  RAFT_LOG_INFO(
+    "Layered HNSW artifact: base topology section written in %ld ms (%.2f GiB, %.2f GiB/s)",
+    layer0_elapsed_ms,
+    to_gib(metadata.base_nodes_bytes + metadata.base_links_bytes),
+    throughput_gib_per_s(metadata.base_nodes_bytes + metadata.base_links_bytes, layer0_elapsed_ms));
+
+  size_t upper_graph_bytes_written = 0;
+  if (hierarchy.hist.size() > 1) {
+    RAFT_LOG_INFO("Layered HNSW artifact: gathering promoted vectors");
+    const auto gather_start_time = std::chrono::steady_clock::now();
+    auto host_query_set =
+      raft::make_host_matrix<T, int64_t>(static_cast<int64_t>(hierarchy.promoted_count()), dim);
+#pragma omp parallel for
+    for (int64_t i = 0; i < static_cast<int64_t>(n_rows); i++) {
+      if (hierarchy.levels[i] > 0) {
+        const auto query_row = hierarchy.order_bw[i] - hierarchy.hist[0];
+        auto* dst            = host_query_set.data_handle() + query_row * dim;
+        std::copy(&dataset(i, 0), &dataset(i, 0) + dim, dst);
+      }
+    }
+    const auto gather_elapsed_ms = elapsed_ms_since(gather_start_time);
+    const auto gathered_bytes    = hierarchy.promoted_count() * dim * sizeof(T);
+    RAFT_LOG_INFO(
+      "Layered HNSW artifact: promoted vectors gathered in %ld ms (%.2f GiB copied, %.2f GiB/s)",
+      gather_elapsed_ms,
+      to_gib(gathered_bytes),
+      throughput_gib_per_s(gathered_bytes, gather_elapsed_ms));
+
+    auto promoted_dataset = raft::make_host_matrix_view<const T, int64_t, raft::row_major>(
+      host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1));
+    const auto upper_layers_start_time = std::chrono::steady_clock::now();
+    build_hnsw_upper_layer_graphs<T, IdxT>(
+      res,
+      promoted_dataset,
+      hierarchy,
+      appr_algo->M_,
+      index_.metric(),
+      [&](size_t pt_level, size_t start_idx, auto& host_neighbors) {
+        const auto& layer = metadata.layers[pt_level - 1];
+        RAFT_LOG_DEBUG("Layered HNSW artifact: writing upper layer level=%zu rows=%zu degree=%zu",
+                       layer.level,
+                       layer.row_count,
+                       layer.degree);
+        const auto layer_write_start_time = std::chrono::steady_clock::now();
+        const size_t target_batch_bytes   = 64 * 1024 * 1024;
+        const size_t row_bytes            = sizeof(IdxT) + metadata.upper_link_row_bytes;
+        const size_t batch_size           = std::max<size_t>(1, target_batch_bytes / row_bytes);
+        std::vector<IdxT> node_buffer(batch_size);
+        std::vector<char> link_buffer(batch_size * metadata.upper_link_row_bytes);
+        for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += batch_size) {
+          const auto current_batch_size = std::min(batch_size, layer.row_count - batch_start);
+          std::fill(link_buffer.begin(),
+                    link_buffer.begin() + current_batch_size * metadata.upper_link_row_bytes,
+                    0);
+        // Upper-layer rows stay in promoted (per-level) order for sequential writes. node_buffer
+        // records each node's original ID and neighbors are remapped to original IDs. The search
+        // node scatters each row into get_linklist(original_id, level) on load.
+#pragma omp parallel for
+          for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
+               ++batch_idx) {
+            const auto row         = batch_start + static_cast<size_t>(batch_idx);
+            node_buffer[batch_idx] = static_cast<IdxT>(hierarchy.order[start_idx + row]);
+            auto* link_row         = link_buffer.data() + batch_idx * metadata.upper_link_row_bytes;
+            hnswlib::linklistsizeint list_count =
+              static_cast<hnswlib::linklistsizeint>(layer.degree);
+            std::memcpy(link_row, &list_count, sizeof(list_count));
+            auto* dst = reinterpret_cast<IdxT*>(link_row + sizeof(hnswlib::linklistsizeint));
+            if (layer.degree > 0) {
+              auto* src = host_neighbors.data_handle() + row * layer.degree;
+              for (size_t j = 0; j < layer.degree; ++j) {
+                dst[j] = static_cast<IdxT>(hierarchy.order[src[j] + start_idx]);
+              }
+            }
+          }
+          cuvs::util::write_large_file(
+            artifact_fd,
+            node_buffer.data(),
+            current_batch_size * sizeof(IdxT),
+            upper_nodes_offset + (layer.node_offset + batch_start) * sizeof(IdxT));
+          cuvs::util::write_large_file(
+            artifact_fd,
+            link_buffer.data(),
+            current_batch_size * metadata.upper_link_row_bytes,
+            upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes);
+        }
+        const auto layer_bytes = layer.row_count * (sizeof(IdxT) + metadata.upper_link_row_bytes);
+        upper_graph_bytes_written += layer_bytes;
+        const auto layer_write_elapsed_ms = elapsed_ms_since(layer_write_start_time);
+        static_cast<void>(layer_write_elapsed_ms);
+        RAFT_LOG_INFO(
+          "Layered HNSW artifact: upper layer level=%zu written in %ld ms (%.2f GiB, %.2f GiB/s)",
+          layer.level,
+          layer_write_elapsed_ms,
+          to_gib(layer_bytes),
+          throughput_gib_per_s(layer_bytes, layer_write_elapsed_ms));
+      });
+    const auto upper_layers_elapsed_ms = elapsed_ms_since(upper_layers_start_time);
+    RAFT_LOG_INFO(
+      "Layered HNSW artifact: upper layers generated and written in %ld ms (%.2f GiB, %.2f "
+      "GiB/s)",
+      upper_layers_elapsed_ms,
+      to_gib(upper_graph_bytes_written),
+      throughput_gib_per_s(upper_graph_bytes_written, upper_layers_elapsed_ms));
+  }
+
+  const auto total_elapsed_ms = elapsed_ms_since(total_start_time);
+  RAFT_LOG_INFO("Layered HNSW artifact: wrote %s in %ld ms (artifact=%.2f GiB, %.2f GiB/s)",
+                artifact_file.string().c_str(),
+                total_elapsed_ms,
+                to_gib(final_file_size),
+                throughput_gib_per_s(final_file_size, total_elapsed_ms));
+  return artifact_file.string();
+}
+
+// Build the standard hnswlib index header (matches HierarchicalNSW::saveIndex byte layout). Single
+// source of the on-disk header encoding, shared by the streaming serialize path and the
+// disk-to-disk materialize path so the two cannot drift.
+inline auto make_hnswlib_native_header(size_t offset_level0,
+                                       size_t n_rows,
+                                       size_t size_data_per_element,
+                                       size_t label_offset,
+                                       size_t offset_data,
+                                       int maxlevel,
+                                       int enterpoint_node,
+                                       size_t maxM,
+                                       size_t maxM0,
+                                       size_t M,
+                                       double mult,
+                                       size_t ef_construction) -> std::vector<char>
+{
+  std::vector<char> buffer;
+  auto append = [&buffer](const auto& value) {
+    const auto* bytes = reinterpret_cast<const char*>(&value);
+    buffer.insert(buffer.end(), bytes, bytes + sizeof(value));
+  };
+  const size_t max_elements      = n_rows;
+  const size_t cur_element_count = n_rows;
+  append(offset_level0);
+  append(max_elements);
+  append(cur_element_count);
+  append(size_data_per_element);
+  append(label_offset);
+  append(offset_data);
+  append(maxlevel);
+  append(enterpoint_node);
+  append(maxM);
+  append(maxM0);
+  append(M);
+  append(mult);
+  append(ef_construction);
+  return buffer;
+}
+
 template <typename T, typename IdxT>
 void serialize_to_hnswlib_from_disk(raft::resources const& res,
                                     std::ostream& os_raw,
@@ -501,44 +1372,12 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
     hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction);
 
   bool create_hierarchy = params.hierarchy != HnswHierarchy::NONE;
-
-  // create hierarchy order
-  // sort the points by levels
-  // roll dice & build histogram
-  std::vector<size_t> hist;
-  std::vector<size_t> order(n_rows);
-  std::vector<size_t> order_bw(n_rows);
-  std::vector<int> levels(n_rows);
-  std::vector<size_t> offsets;
-
-  if (create_hierarchy) {
-    RAFT_LOG_INFO("Sort points by levels");
-    for (int64_t i = 0; i < n_rows; i++) {
-      auto pt_level = appr_algo->getRandomLevel(appr_algo->mult_);
-      while (pt_level >= static_cast<int32_t>(hist.size()))
-        hist.push_back(0);
-      hist[pt_level]++;
-      levels[i] = pt_level;
-    }
-
-    // accumulate
-    offsets.resize(hist.size() + 1, 0);
-    for (size_t i = 0; i < hist.size() - 1; i++) {
-      offsets[i + 1] = offsets[i] + hist[i];
-      RAFT_LOG_INFO("Level %zu : %zu", i + 1, size_t(n_rows) - offsets[i + 1]);
-    }
-
-    // fw/bw indices
-    for (int64_t i = 0; i < n_rows; i++) {
-      auto pt_level              = levels[i];
-      order_bw[i]                = offsets[pt_level];
-      order[offsets[pt_level]++] = i;
-    }
-  }
+  hnsw_level_plan hierarchy;
+  if (create_hierarchy) { hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, "Level "); }
 
   // set last point of the highest level as the entry point
-  appr_algo->enterpoint_node_ = create_hierarchy ? order.back() : n_rows / 2;
-  appr_algo->maxlevel_        = create_hierarchy ? hist.size() - 1 : 1;
+  appr_algo->enterpoint_node_ = create_hierarchy ? hierarchy.order.back() : n_rows / 2;
+  appr_algo->maxlevel_        = create_hierarchy ? hierarchy.max_level() : 1;
 
   // write header information
   RAFT_LOG_DEBUG("Writing HNSW header: offsetLevel0=%zu, n_rows=%zu, size_data_per_element=%zu",
@@ -552,37 +1391,25 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
                  appr_algo->maxM0_,
                  appr_algo->M_);
 
-  // offset_level_0
-  os.write(reinterpret_cast<char*>(&appr_algo->offsetLevel0_), sizeof(std::size_t));
-  // 8 max_element - override with n_rows
-  size_t num_elements = (size_t)n_rows;
-  os.write(reinterpret_cast<char*>(&num_elements), sizeof(std::size_t));
-  // 16 curr_element_count - override with n_rows
-  os.write(reinterpret_cast<char*>(&num_elements), sizeof(std::size_t));
-  // 24 size_data_per_element
-  os.write(reinterpret_cast<char*>(&appr_algo->size_data_per_element_), sizeof(std::size_t));
-  // 32 label_offset
-  os.write(reinterpret_cast<char*>(&appr_algo->label_offset_), sizeof(std::size_t));
-  // 40 offset_data
-  os.write(reinterpret_cast<char*>(&appr_algo->offsetData_), sizeof(std::size_t));
-  // 48 maxlevel
-  os.write(reinterpret_cast<char*>(&appr_algo->maxlevel_), sizeof(int));
-  // 52 enterpoint_node
-  os.write(reinterpret_cast<char*>(&appr_algo->enterpoint_node_), sizeof(int));
-  // 56 maxM
-  os.write(reinterpret_cast<char*>(&appr_algo->maxM_), sizeof(std::size_t));
-  // 64 maxM0
-  os.write(reinterpret_cast<char*>(&appr_algo->maxM0_), sizeof(std::size_t));
-  // 72 M
-  os.write(reinterpret_cast<char*>(&appr_algo->M_), sizeof(std::size_t));
-  // 80 mult
-  os.write(reinterpret_cast<char*>(&appr_algo->mult_), sizeof(double));
-  // 88 ef_construction
-  os.write(reinterpret_cast<char*>(&appr_algo->ef_construction_), sizeof(std::size_t));
+  // Write the hnswlib index header (max_element / cur_element_count overridden with n_rows) using
+  // the shared encoder so the byte layout stays in lockstep with the materialize path.
+  const auto native_header = make_hnswlib_native_header(appr_algo->offsetLevel0_,
+                                                        static_cast<size_t>(n_rows),
+                                                        appr_algo->size_data_per_element_,
+                                                        appr_algo->label_offset_,
+                                                        appr_algo->offsetData_,
+                                                        appr_algo->maxlevel_,
+                                                        appr_algo->enterpoint_node_,
+                                                        appr_algo->maxM_,
+                                                        appr_algo->maxM0_,
+                                                        appr_algo->M_,
+                                                        appr_algo->mult_,
+                                                        appr_algo->ef_construction_);
+  os.write(native_header.data(), static_cast<std::streamsize>(native_header.size()));
 
   // host queries
   auto host_query_set =
-    raft::make_host_matrix<T, int64_t>(create_hierarchy ? n_rows - hist[0] : 0, dim);
+    raft::make_host_matrix<T, int64_t>(create_hierarchy ? hierarchy.promoted_count() : 0, dim);
 
   int64_t d_report_offset    = n_rows / 10;  // Report progress in 10% steps.
   int64_t next_report_offset = d_report_offset;
@@ -690,11 +1517,11 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
       const T* data_row = &dataset_buffer(batch_idx, 0);
       os.write(reinterpret_cast<const char*>(data_row), sizeof(T) * dim);
 
-      if (create_hierarchy && levels[i] > 0) {
+      if (create_hierarchy && hierarchy.levels[i] > 0) {
         // position in query: order_bw[i]-hist[0]
-        std::copy(data_row,
-                  data_row + dim,
-                  reinterpret_cast<char*>(&host_query_set(order_bw[i] - hist[0], 0)));
+        auto* dst = host_query_set.data_handle() +
+                    (hierarchy.order_bw[i] - hierarchy.hist[0]) * static_cast<size_t>(dim);
+        std::copy(data_row, data_row + dim, dst);
       }
 
       // assign original label
@@ -732,22 +1559,15 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
   // trigger knn builds for all levels
   std::vector<raft::host_matrix<IdxT, int64_t>> host_neighbors;
   if (create_hierarchy) {
-    for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) {
-      auto num_pts       = n_rows - offsets[pt_level - 1];
-      auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1;
-      host_neighbors.emplace_back(raft::make_host_matrix<IdxT, int64_t>(num_pts, neighbor_size));
-    }
-    for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) {
-      RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level);
-      auto removed_rows = offsets[pt_level - 1] - offsets[0];
-      raft::host_matrix_view<T, int64_t, raft::row_major> sub_query_view(
-        host_query_set.data_handle() + removed_rows * dim,
-        host_query_set.extent(0) - removed_rows,
-        dim);
-      auto neighbor_view = host_neighbors[pt_level - 1].view();
-      all_neighbors_graph(
-        res, raft::make_const_mdspan(sub_query_view), neighbor_view, index_.metric());
-    }
+    host_neighbors.reserve(hierarchy.hist.size() - 1);
+    build_hnsw_upper_layer_graphs<T, IdxT>(
+      res,
+      raft::make_host_matrix_view<const T, int64_t, raft::row_major>(
+        host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1)),
+      hierarchy,
+      appr_algo->M_,
+      index_.metric(),
+      [&](size_t, size_t, auto& neighbors) { host_neighbors.emplace_back(std::move(neighbors)); });
   }
 
   if (create_hierarchy) {
@@ -758,7 +1578,7 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
   start_clock   = std::chrono::system_clock::now();
 
   for (int64_t i = 0; i < n_rows; i++) {
-    size_t cur_level = create_hierarchy ? levels[i] : 0;
+    size_t cur_level = create_hierarchy ? hierarchy.levels[i] : 0;
     unsigned int linkListSize =
       create_hierarchy && cur_level > 0 ? appr_algo->size_links_per_element_ * cur_level : 0;
     os.write(reinterpret_cast<char*>(&linkListSize), sizeof(int));
@@ -766,14 +1586,16 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res,
     if (linkListSize) {
       for (size_t pt_level = 1; pt_level <= cur_level; pt_level++) {
         auto neighbor_view = host_neighbors[pt_level - 1].view();
-        auto my_row        = order_bw[i] - offsets[pt_level - 1];
+        auto my_row        = hierarchy.order_bw[i] - hierarchy.offsets[pt_level - 1];
 
-        IdxT* neighbors     = &neighbor_view(my_row, 0);
         unsigned int extent = neighbor_view.extent(1);
         os.write(reinterpret_cast<char*>(&extent), sizeof(int));
-        for (unsigned int j = 0; j < extent; j++) {
-          const IdxT converted = order[neighbors[j] + offsets[pt_level - 1]];
-          os.write(reinterpret_cast<const char*>(&converted), sizeof(IdxT));
+        if (extent > 0) {
+          IdxT* neighbors = &neighbor_view(my_row, 0);
+          for (unsigned int j = 0; j < extent; j++) {
+            const IdxT converted = hierarchy.order[neighbors[j] + hierarchy.offsets[pt_level - 1]];
+            os.write(reinterpret_cast<const char*>(&converted), sizeof(IdxT));
+          }
         }
         auto remainder = appr_algo->M_ - neighbor_view.extent(1);
         for (size_t j = 0; j < remainder; j++) {
@@ -1088,6 +1910,18 @@ std::unique_ptr<index<T>> from_cagra(
       std::filesystem::exists(index_directory) && std::filesystem::is_directory(index_directory),
       "Directory '%s' does not exist",
       index_directory.c_str());
+    if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+      RAFT_EXPECTS(dataset.has_value(),
+                   "Layered HNSW serialization requires the original-order dataset.");
+      auto artifact_path =
+        serialize_to_layered_hnsw_from_disk(res, params, cagra_index, dataset.value());
+
+      auto hnsw_index =
+        std::make_unique<index_impl<T>>(cagra_index.dim(), cagra_index.metric(), params.hierarchy);
+      hnsw_index->set_file_descriptor(cuvs::util::file_descriptor(artifact_path, O_RDONLY));
+      return hnsw_index;
+    }
+
     std::string index_filename =
       (std::filesystem::path(index_directory) / "hnsw_index.bin").string();
 
@@ -1118,6 +1952,8 @@ std::unique_ptr<index<T>> from_cagra(
     return from_cagra<T, HnswHierarchy::CPU>(res, params, cagra_index, dataset);
   } else if (params.hierarchy == HnswHierarchy::GPU) {
     return from_cagra<T, HnswHierarchy::GPU>(res, params, cagra_index, dataset);
+  } else if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    RAFT_FAIL("GPU_LAYERED_ON_DISK requires disk-backed ACE build artifacts.");
   } else {
     RAFT_FAIL("Unsupported hierarchy type");
   }
@@ -1228,6 +2064,13 @@ void serialize(raft::resources const& res, const std::string& filename, const in
                  "Disk-based index file does not exist: %s",
                  source_path.c_str());
 
+    if (idx_impl->hierarchy() == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+      copy_file_overwrite(source_path, filename);
+      RAFT_LOG_INFO(
+        "Copied layered HNSW index from %s to %s", source_path.c_str(), filename.c_str());
+      return;
+    }
+
     // Copy the file to the new location
     std::filesystem::copy_file(
       source_path, filename, std::filesystem::copy_options::overwrite_existing);
@@ -1241,6 +2084,1074 @@ void serialize(raft::resources const& res, const std::string& filename, const in
   hnswlib_index->saveIndex(filename);
 }
 
+// Parsed + fully validated view of a layered HNSW artifact: header metadata, the payload section
+// offsets, and the per-row levels array. Both the in-memory deserialize path and the disk-to-disk
+// materialize path consume this so the artifact format (offset arithmetic and the size-invariant
+// checks) lives in a single place and cannot drift between them.
+struct layered_artifact_view {
+  layered_hnsw_file_metadata metadata;
+  size_t artifact_size      = 0;
+  size_t levels_offset      = 0;
+  size_t base_nodes_offset  = 0;
+  size_t base_links_offset  = 0;
+  size_t upper_nodes_offset = 0;
+  size_t upper_links_offset = 0;
+  std::vector<uint8_t> levels;  // per-row top level, length n_rows
+};
+
+// Reads and validates the artifact header/descriptors/levels from an already-open artifact fd.
+// Enforces every structural invariant (magic, version, dtype, metric, dim, layer count, section
+// sizes, file length, and the levels max-level) BEFORE the caller allocates buffers sized from
+// these fields, so a corrupt/truncated/crafted header cannot drive an out-of-bounds read or write.
+template <typename T>
+auto read_and_validate_layered_artifact(const cuvs::util::file_descriptor& artifact_fd,
+                                        const std::string& artifact_path,
+                                        int dim,
+                                        cuvs::distance::DistanceType metric)
+  -> layered_artifact_view
+{
+  layered_hnsw_file_header header{};
+  cuvs::util::read_large_file(artifact_fd, &header, sizeof(header), 0);
+  RAFT_EXPECTS(std::strncmp(header.magic, layered_hnsw_magic, sizeof(header.magic)) == 0,
+               "Invalid layered HNSW artifact magic: %s",
+               artifact_path.c_str());
+  RAFT_EXPECTS(header.version == layered_hnsw_version,
+               "Unsupported layered HNSW artifact version %u",
+               header.version);
+  RAFT_EXPECTS(header.dtype == static_cast<uint32_t>(layered_dtype_code<T>()),
+               "Layered HNSW artifact dtype (%s) does not match requested dtype (%s)",
+               layered_dtype_name(static_cast<layered_hnsw_dtype>(header.dtype)),
+               layered_dtype_name(layered_dtype_code<T>()));
+  RAFT_EXPECTS(header.metric == static_cast<uint32_t>(metric),
+               "Layered HNSW artifact metric (%s) does not match requested metric (%s)",
+               metric_name(static_cast<cuvs::distance::DistanceType>(header.metric)),
+               metric_name(metric));
+
+  layered_artifact_view view;
+  view.artifact_size            = static_cast<size_t>(std::filesystem::file_size(artifact_path));
+  const auto descriptors_offset = sizeof(layered_hnsw_file_header);
+  const auto descriptors_bytes =
+    static_cast<size_t>(header.num_layers) * sizeof(layered_hnsw_layer_descriptor);
+  RAFT_EXPECTS(descriptors_offset + descriptors_bytes <= view.artifact_size,
+               "Layered HNSW layer descriptors are outside artifact: offset=%zu size=%zu "
+               "artifact=%zu",
+               descriptors_offset,
+               descriptors_bytes,
+               view.artifact_size);
+
+  std::vector<layered_hnsw_layer_descriptor> layer_descriptors(header.num_layers);
+  if (descriptors_bytes > 0) {
+    cuvs::util::read_large_file(
+      artifact_fd, layer_descriptors.data(), descriptors_bytes, descriptors_offset);
+  }
+  auto& metadata = view.metadata;
+  metadata       = layered_hnsw_metadata_from_header(header);
+  metadata.layers.reserve(layer_descriptors.size());
+  for (const auto& descriptor : layer_descriptors) {
+    metadata.layers.push_back({static_cast<size_t>(descriptor.level),
+                               static_cast<size_t>(descriptor.row_count),
+                               static_cast<size_t>(descriptor.degree),
+                               static_cast<size_t>(descriptor.node_offset),
+                               static_cast<size_t>(descriptor.link_offset)});
+  }
+
+  RAFT_EXPECTS(metadata.n_rows > 0, "Layered HNSW artifact must contain at least one row");
+  RAFT_EXPECTS(metadata.dim > 0, "Layered HNSW artifact must contain at least one dimension");
+  RAFT_EXPECTS(static_cast<size_t>(dim) == metadata.dim,
+               "Layered HNSW artifact dim (%zu) does not match requested dim (%d)",
+               metadata.dim,
+               dim);
+  RAFT_EXPECTS(metadata.layers.size() == static_cast<size_t>(metadata.maxlevel),
+               "Layered HNSW artifact has %zu upper layers, expected %d",
+               metadata.layers.size(),
+               metadata.maxlevel);
+
+  // Section-size invariants: every buffer below is sized from n_rows / upper_nodes_count, so the
+  // declared section byte counts must match exactly or a later read/scatter would overrun.
+  RAFT_EXPECTS(metadata.levels_bytes == metadata.n_rows * sizeof(uint8_t),
+               "Layered HNSW levels section size mismatch");
+  RAFT_EXPECTS(metadata.base_nodes_bytes == metadata.n_rows * sizeof(uint32_t),
+               "Layered HNSW base node section size mismatch");
+  RAFT_EXPECTS(metadata.base_links_bytes == metadata.n_rows * metadata.base_link_row_bytes,
+               "Layered HNSW base links section size mismatch");
+  RAFT_EXPECTS(metadata.upper_nodes_bytes == metadata.upper_nodes_count * sizeof(uint32_t),
+               "Layered HNSW upper node section size mismatch");
+  RAFT_EXPECTS(
+    metadata.upper_links_bytes == metadata.upper_nodes_count * metadata.upper_link_row_bytes,
+    "Layered HNSW upper link section size mismatch");
+  RAFT_EXPECTS(metadata.base_degree <= metadata.maxM0,
+               "Layered HNSW base degree (%zu) exceeds maxM0 (%zu)",
+               metadata.base_degree,
+               metadata.maxM0);
+
+  const auto payload_offset =
+    align_up(descriptors_offset + descriptors_bytes, layered_hnsw_alignment);
+  view.levels_offset            = payload_offset;
+  view.base_nodes_offset        = view.levels_offset + metadata.levels_bytes;
+  view.base_links_offset        = view.base_nodes_offset + metadata.base_nodes_bytes;
+  view.upper_nodes_offset       = view.base_links_offset + metadata.base_links_bytes;
+  view.upper_links_offset       = view.upper_nodes_offset + metadata.upper_nodes_bytes;
+  const auto expected_file_size = view.upper_links_offset + metadata.upper_links_bytes;
+  RAFT_EXPECTS(view.artifact_size >= expected_file_size,
+               "Layered HNSW artifact is truncated: expected at least %zu bytes, got %zu",
+               expected_file_size,
+               view.artifact_size);
+
+  view.levels.resize(metadata.n_rows);
+  cuvs::util::read_large_file(
+    artifact_fd, view.levels.data(), metadata.levels_bytes, view.levels_offset);
+  const auto max_level_in_levels = *std::max_element(view.levels.begin(), view.levels.end());
+  RAFT_EXPECTS(static_cast<int>(max_level_in_levels) == metadata.maxlevel,
+               "Layered HNSW levels max level (%d) does not match artifact maxlevel (%d)",
+               static_cast<int>(max_level_in_levels),
+               metadata.maxlevel);
+  return view;
+}
+
+template <typename T>
+auto deserialize_layered_hnsw(raft::resources const& res,
+                              const index_params& params,
+                              const std::string& artifact_path,
+                              int dim,
+                              cuvs::distance::DistanceType metric) -> std::unique_ptr<index<T>>
+{
+  common::nvtx::range<common::nvtx::domain::cuvs> fun_scope("hnsw::deserialize_layered");
+  const auto total_start_time    = std::chrono::steady_clock::now();
+  const auto metadata_start_time = std::chrono::steady_clock::now();
+  cuvs::util::file_descriptor artifact_fd(artifact_path, O_RDONLY);
+  auto view      = read_and_validate_layered_artifact<T>(artifact_fd, artifact_path, dim, metric);
+  auto& metadata = view.metadata;
+  const auto artifact_size       = view.artifact_size;
+  const auto base_nodes_offset   = view.base_nodes_offset;
+  const auto base_links_offset   = view.base_links_offset;
+  const auto upper_nodes_offset  = view.upper_nodes_offset;
+  const auto upper_links_offset  = view.upper_links_offset;
+  const auto metadata_elapsed_ms = elapsed_ms_since(metadata_start_time);
+
+  RAFT_EXPECTS(!params.dataset_path.empty(),
+               "Layered HNSW deserialization requires index_params.dataset_path");
+  RAFT_LOG_INFO("Layered HNSW load: metadata read in %ld ms (rows=%zu dim=%zu artifact=%.2f GiB)",
+                metadata_elapsed_ms,
+                metadata.n_rows,
+                metadata.dim,
+                to_gib(artifact_size));
+
+  const auto dataset_open_start_time = std::chrono::steady_clock::now();
+  auto dataset_file                  = open_layered_dataset_file<T>(params.dataset_path);
+  const auto dataset_open_elapsed_ms = elapsed_ms_since(dataset_open_start_time);
+
+  RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == metadata.n_rows &&
+                 dataset_file.shape[1] == metadata.dim,
+               "Layered HNSW dataset shape mismatch: artifact rows=%zu dim=%zu, dataset rows=%zu "
+               "dim=%zu path=%s",
+               metadata.n_rows,
+               metadata.dim,
+               dataset_file.shape.size() > 0 ? dataset_file.shape[0] : 0,
+               dataset_file.shape.size() > 1 ? dataset_file.shape[1] : 0,
+               params.dataset_path.c_str());
+
+  RAFT_LOG_INFO("Layered HNSW load: dataset header validated in %ld ms (%s)",
+                dataset_open_elapsed_ms,
+                params.dataset_path.c_str());
+
+  const auto dataset_total_bytes = metadata.n_rows * metadata.dim * sizeof(T);
+  const auto deserialize_progress_total_bytes =
+    metadata.levels_bytes + dataset_total_bytes + metadata.base_nodes_bytes +
+    metadata.base_links_bytes + metadata.upper_nodes_bytes + metadata.upper_links_bytes;
+  size_t deserialize_progress_bytes        = 0;
+  size_t deserialize_next_progress_percent = 10;
+  auto log_deserialize_progress            = [&](size_t bytes_loaded) {
+    if (deserialize_progress_total_bytes == 0) { return; }
+    deserialize_progress_bytes =
+      std::min(deserialize_progress_total_bytes, deserialize_progress_bytes + bytes_loaded);
+    const auto progress_percent =
+      (deserialize_progress_bytes * 100) / deserialize_progress_total_bytes;
+    const auto progress_step = std::min<size_t>(100, (progress_percent / 10) * 10);
+    if (progress_step >= deserialize_next_progress_percent) {
+      RAFT_LOG_INFO("Layered HNSW load: deserialize progress %zu%% (%.2f/%.2f GiB)",
+                    progress_step,
+                    to_gib(deserialize_progress_bytes),
+                    to_gib(deserialize_progress_total_bytes));
+      deserialize_next_progress_percent = progress_step + 10;
+    }
+  };
+
+  auto& levels_u8 = view.levels;
+  log_deserialize_progress(metadata.levels_bytes);
+
+  const auto allocation_start_time = std::chrono::steady_clock::now();
+  auto hnsw_index                  = std::make_unique<index_impl<T>>(dim, metric, params.hierarchy);
+  auto appr_algo = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
+    hnsw_index->get_space(), metadata.n_rows, metadata.M, metadata.ef_construction);
+  appr_algo->cur_element_count = metadata.n_rows;
+  appr_algo->maxlevel_         = metadata.maxlevel;
+  appr_algo->enterpoint_node_  = metadata.enterpoint_node;
+  std::fill_n(appr_algo->linkLists_, metadata.n_rows, nullptr);
+  RAFT_EXPECTS(appr_algo->size_links_level0_ == metadata.base_link_row_bytes,
+               "Layered HNSW base link row size mismatch");
+  RAFT_EXPECTS(appr_algo->size_links_per_element_ == metadata.upper_link_row_bytes,
+               "Layered HNSW upper link row size mismatch");
+  RAFT_EXPECTS(appr_algo->maxM0_ == metadata.maxM0 && appr_algo->maxM_ == metadata.maxM,
+               "Layered HNSW M parameter mismatch");
+  const auto allocation_elapsed_ms = elapsed_ms_since(allocation_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW load: allocated hnswlib storage in %ld ms (base=%.2f GiB upper=%.2f GiB)",
+    allocation_elapsed_ms,
+    to_gib(metadata.n_rows * appr_algo->size_data_per_element_),
+    to_gib(metadata.upper_nodes_count * metadata.upper_link_row_bytes));
+
+  auto num_threads =
+    params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
+
+  const size_t target_batch_bytes = 64 * 1024 * 1024;
+  const size_t dataset_row_bytes  = metadata.dim * sizeof(T);
+  const size_t batch_size         = std::max<size_t>(1, target_batch_bytes / dataset_row_bytes);
+  auto dataset_buffer =
+    raft::make_host_matrix<T, int64_t>(static_cast<int64_t>(batch_size), metadata.dim);
+
+  const auto base_start_time = std::chrono::steady_clock::now();
+  std::chrono::steady_clock::duration dataset_read_time{};
+  std::chrono::steady_clock::duration base_copy_time{};
+  size_t dataset_bytes_read = 0;
+  for (size_t batch_start = 0; batch_start < metadata.n_rows; batch_start += batch_size) {
+    const auto current_batch_size    = std::min(batch_size, metadata.n_rows - batch_start);
+    const auto current_dataset_bytes = current_batch_size * metadata.dim * sizeof(T);
+    auto batch_timer                 = std::chrono::steady_clock::now();
+    cuvs::util::read_large_file(dataset_file.fd,
+                                dataset_buffer.data_handle(),
+                                current_dataset_bytes,
+                                dataset_file.header_size + batch_start * metadata.dim * sizeof(T));
+    dataset_read_time += std::chrono::steady_clock::now() - batch_timer;
+    dataset_bytes_read += current_dataset_bytes;
+
+    bool link_list_allocation_failed = false;
+    batch_timer                      = std::chrono::steady_clock::now();
+#pragma omp parallel for num_threads(num_threads) reduction(|| : link_list_allocation_failed)
+    for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size); ++batch_idx) {
+      const auto i                  = batch_start + static_cast<size_t>(batch_idx);
+      auto level                    = static_cast<int32_t>(levels_u8[i]);
+      appr_algo->element_levels_[i] = level;
+      memcpy(appr_algo->getDataByInternalId(i),
+             dataset_buffer.data_handle() + batch_idx * metadata.dim,
+             appr_algo->data_size_);
+      *appr_algo->getExternalLabeLp(i) = static_cast<hnswlib::labeltype>(i);
+      if (level > 0) {
+        const auto link_list_size = appr_algo->size_links_per_element_ * level + 1;
+        appr_algo->linkLists_[i]  = static_cast<char*>(malloc(link_list_size));
+        if (appr_algo->linkLists_[i] == nullptr) {
+          link_list_allocation_failed = true;
+          continue;
+        }
+      }
+    }
+    if (link_list_allocation_failed) {
+      for (size_t i = 0; i < metadata.n_rows; ++i) {
+        free(appr_algo->linkLists_[i]);
+        appr_algo->linkLists_[i] = nullptr;
+      }
+      throw std::runtime_error("Not enough memory to allocate HNSW upper linklists");
+    }
+    base_copy_time += std::chrono::steady_clock::now() - batch_timer;
+    log_deserialize_progress(current_dataset_bytes);
+  }
+  const auto base_elapsed_ms = elapsed_ms_since(base_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW load: hnswlib data and levels initialized in %ld ms "
+    "(dataset read %.2f ms %.2f GiB %.2f GiB/s, copy %.2f ms)",
+    base_elapsed_ms,
+    elapsed_ms(dataset_read_time),
+    to_gib(dataset_bytes_read),
+    throughput_gib_per_s(dataset_bytes_read, dataset_read_time),
+    elapsed_ms(base_copy_time));
+
+  const auto base_topology_start_time = std::chrono::steady_clock::now();
+  std::chrono::steady_clock::duration base_topology_read_time{};
+  std::chrono::steady_clock::duration base_topology_copy_time{};
+  size_t base_topology_bytes_read      = 0;
+  const size_t base_topology_row_bytes = sizeof(uint32_t) + metadata.base_link_row_bytes;
+  const size_t base_topology_batch_size =
+    std::max<size_t>(1, target_batch_bytes / base_topology_row_bytes);
+  std::vector<uint32_t> base_node_buffer(base_topology_batch_size);
+  std::vector<char> base_link_buffer(base_topology_batch_size * metadata.base_link_row_bytes);
+  for (size_t batch_start = 0; batch_start < metadata.n_rows;
+       batch_start += base_topology_batch_size) {
+    const auto current_batch_size =
+      std::min(base_topology_batch_size, metadata.n_rows - batch_start);
+    const auto current_base_topology_bytes =
+      current_batch_size * (sizeof(uint32_t) + metadata.base_link_row_bytes);
+    auto batch_timer = std::chrono::steady_clock::now();
+    cuvs::util::read_large_file(artifact_fd,
+                                base_node_buffer.data(),
+                                current_batch_size * sizeof(uint32_t),
+                                base_nodes_offset + batch_start * sizeof(uint32_t));
+    cuvs::util::read_large_file(artifact_fd,
+                                base_link_buffer.data(),
+                                current_batch_size * metadata.base_link_row_bytes,
+                                base_links_offset + batch_start * metadata.base_link_row_bytes);
+    base_topology_read_time += std::chrono::steady_clock::now() - batch_timer;
+    base_topology_bytes_read += current_base_topology_bytes;
+
+    batch_timer          = std::chrono::steady_clock::now();
+    bool invalid_node_id = false;
+#pragma omp parallel for num_threads(num_threads) reduction(|| : invalid_node_id)
+    for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size); ++batch_idx) {
+      const auto node_id = static_cast<size_t>(base_node_buffer[batch_idx]);
+      if (node_id >= metadata.n_rows) {
+        invalid_node_id = true;
+        continue;
+      }
+      auto ll0 = appr_algo->get_linklist0(node_id);
+      memcpy(ll0,
+             base_link_buffer.data() + batch_idx * metadata.base_link_row_bytes,
+             metadata.base_link_row_bytes);
+    }
+    RAFT_EXPECTS(!invalid_node_id, "Invalid base-layer node id in layered HNSW artifact");
+    base_topology_copy_time += std::chrono::steady_clock::now() - batch_timer;
+    log_deserialize_progress(current_base_topology_bytes);
+  }
+  const auto base_topology_elapsed_ms = elapsed_ms_since(base_topology_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW load: base-layer topology loaded in %ld ms "
+    "(read %.2f ms %.2f GiB %.2f GiB/s, scatter-copy %.2f ms)",
+    base_topology_elapsed_ms,
+    elapsed_ms(base_topology_read_time),
+    to_gib(base_topology_bytes_read),
+    throughput_gib_per_s(base_topology_bytes_read, base_topology_read_time),
+    elapsed_ms(base_topology_copy_time));
+
+  const auto upper_start_time = std::chrono::steady_clock::now();
+  std::chrono::steady_clock::duration upper_read_time{};
+  std::chrono::steady_clock::duration upper_copy_time{};
+  size_t upper_bytes_read = 0;
+  for (const auto& layer : metadata.layers) {
+    const auto layer_row_bytes = sizeof(uint32_t) + metadata.upper_link_row_bytes;
+    const auto layer_batch_size =
+      std::min(std::max<size_t>(1, target_batch_bytes / layer_row_bytes), layer.row_count);
+    std::vector<uint32_t> node_buffer(layer_batch_size);
+    std::vector<char> link_buffer(layer_batch_size * metadata.upper_link_row_bytes);
+    for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += layer_batch_size) {
+      const auto current_batch_size = std::min(layer_batch_size, layer.row_count - batch_start);
+      const auto current_upper_bytes =
+        current_batch_size * (sizeof(uint32_t) + metadata.upper_link_row_bytes);
+      auto batch_timer = std::chrono::steady_clock::now();
+      cuvs::util::read_large_file(
+        artifact_fd,
+        node_buffer.data(),
+        current_batch_size * sizeof(uint32_t),
+        upper_nodes_offset + (layer.node_offset + batch_start) * sizeof(uint32_t));
+      cuvs::util::read_large_file(
+        artifact_fd,
+        link_buffer.data(),
+        current_batch_size * metadata.upper_link_row_bytes,
+        upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes);
+      upper_read_time += std::chrono::steady_clock::now() - batch_timer;
+      upper_bytes_read += current_upper_bytes;
+      batch_timer             = std::chrono::steady_clock::now();
+      bool invalid_node_id    = false;
+      bool invalid_node_level = false;
+#pragma omp parallel for num_threads(num_threads) \
+  reduction(|| : invalid_node_id, invalid_node_level)
+      for (int64_t batch_idx = 0; batch_idx < static_cast<int64_t>(current_batch_size);
+           ++batch_idx) {
+        const auto node_id = static_cast<size_t>(node_buffer[batch_idx]);
+        if (node_id >= metadata.n_rows) {
+          invalid_node_id = true;
+          continue;
+        }
+        if (layer.level > static_cast<size_t>(levels_u8[node_id])) {
+          invalid_node_level = true;
+          continue;
+        }
+        auto ll = appr_algo->get_linklist(node_id, layer.level);
+        memcpy(ll,
+               link_buffer.data() + batch_idx * metadata.upper_link_row_bytes,
+               metadata.upper_link_row_bytes);
+      }
+      RAFT_EXPECTS(!invalid_node_id, "Invalid upper-layer node id in layered HNSW artifact");
+      RAFT_EXPECTS(!invalid_node_level,
+                   "Layered HNSW artifact references a node at an invalid upper level");
+      upper_copy_time += std::chrono::steady_clock::now() - batch_timer;
+      log_deserialize_progress(current_upper_bytes);
+    }
+  }
+  const auto upper_elapsed_ms = elapsed_ms_since(upper_start_time);
+  RAFT_LOG_INFO(
+    "Layered HNSW load: upper layers loaded in %ld ms "
+    "(read %.2f ms %.2f GiB %.2f GiB/s, validate+copy %.2f ms)",
+    upper_elapsed_ms,
+    elapsed_ms(upper_read_time),
+    to_gib(upper_bytes_read),
+    throughput_gib_per_s(upper_bytes_read, upper_read_time),
+    elapsed_ms(upper_copy_time));
+
+  hnsw_index->set_index(std::move(appr_algo));
+  RAFT_LOG_INFO("Layered HNSW load: total deserialize completed in %ld ms",
+                elapsed_ms_since(total_start_time));
+  return hnsw_index;
+}
+
+// Disk-to-disk materialization: layered HNSW artifact -> standard hnswlib index file.
+// Constants/offsets shared by the materialization helpers.
+struct hnswlib_materialize_layout {
+  size_t n_rows                = 0;
+  size_t dim                   = 0;
+  size_t base_link_row_bytes   = 0;
+  size_t upper_link_row_bytes  = 0;
+  size_t size_data_per_element = 0;
+  size_t offset_data           = 0;
+  size_t label_offset          = 0;
+  size_t data_size             = 0;
+  // Output file offsets.
+  size_t base_region_offset  = 0;
+  size_t upper_region_offset = 0;
+  // Artifact payload offsets.
+  size_t base_nodes_offset  = 0;
+  size_t base_links_offset  = 0;
+  size_t upper_nodes_offset = 0;
+  size_t upper_links_offset = 0;
+};
+
+// Routes fixed-size, ID-keyed records into per-bucket temporary files and replays them per bucket.
+// Used to reorder the (small) base/upper topology under a bounded host-memory budget while keeping
+// all disk access sequential.
+struct id_record_spiller {
+  std::filesystem::path dir;
+  size_t rows_per_bucket;
+  size_t num_buckets;
+  size_t record_bytes;
+  size_t buffer_cap;
+  std::vector<cuvs::util::file_descriptor> fds;
+  std::vector<size_t> offsets;
+  std::vector<std::vector<char>> buffers;
+  // One mutex per bucket so distinct buckets can be appended concurrently (add() is called from
+  // parallel scatter threads); same-bucket appends serialize on their own mutex.
+  std::vector<std::mutex> bucket_mutexes;
+
+  id_record_spiller(std::filesystem::path dir_,
+                    size_t num_buckets_,
+                    size_t rows_per_bucket_,
+                    size_t record_bytes_,
+                    size_t total_buffer_budget)
+    : dir(std::move(dir_)),
+      rows_per_bucket(rows_per_bucket_),
+      num_buckets(num_buckets_),
+      record_bytes(record_bytes_),
+      offsets(num_buckets_, 0),
+      buffers(num_buckets_),
+      bucket_mutexes(num_buckets_)
+  {
+    std::filesystem::create_directories(dir);
+    fds.reserve(num_buckets);
+    for (size_t b = 0; b < num_buckets; ++b) {
+      fds.emplace_back((dir / ("bucket_" + std::to_string(b) + ".tmp")).string(),
+                       O_CREAT | O_RDWR | O_TRUNC,
+                       0644);
+    }
+    // Floor the per-bucket buffer so a tiny budget (many buckets) still batches many records per
+    // flush instead of degrading into one write syscall per record. This may exceed the nominal
+    // budget for pathologically small budgets, but keeps disk I/O coarse-grained and sequential.
+    constexpr size_t kMinRecordsPerFlush = 256;
+    const size_t even_share              = total_buffer_budget / std::max<size_t>(1, num_buckets);
+    buffer_cap                           = std::max(record_bytes * kMinRecordsPerFlush, even_share);
+  }
+
+  id_record_spiller(const id_record_spiller&)            = delete;
+  id_record_spiller& operator=(const id_record_spiller&) = delete;
+  id_record_spiller(id_record_spiller&&)                 = delete;
+  id_record_spiller& operator=(id_record_spiller&&)      = delete;
+
+  ~id_record_spiller() noexcept
+  {
+    fds.clear();
+    std::error_code ec;
+    std::filesystem::remove_all(dir, ec);
+  }
+
+  void flush(size_t b)
+  {
+    if (buffers[b].empty()) { return; }
+    cuvs::util::write_large_file(fds[b], buffers[b].data(), buffers[b].size(), offsets[b]);
+    offsets[b] += buffers[b].size();
+    buffers[b].clear();
+  }
+
+  void add(size_t bucket, const void* record)
+  {
+    std::lock_guard<std::mutex> guard(bucket_mutexes[bucket]);
+    auto& buf = buffers[bucket];
+    if (buf.size() + record_bytes > buffer_cap) { flush(bucket); }
+    const auto* p = reinterpret_cast<const char*>(record);
+    buf.insert(buf.end(), p, p + record_bytes);
+  }
+
+  void finish_writes()
+  {
+    for (size_t b = 0; b < num_buckets; ++b) {
+      flush(b);
+    }
+  }
+
+  template <class F>
+  void replay(size_t b, std::vector<char>& chunk, F&& consume)
+  {
+    const size_t total          = offsets[b];
+    const size_t recs_per_chunk = std::max<size_t>(1, (size_t{64} << 20) / record_bytes);
+    chunk.resize(recs_per_chunk * record_bytes);
+    size_t read_off = 0;
+    while (read_off < total) {
+      const size_t bytes = std::min(chunk.size(), total - read_off);
+      cuvs::util::read_large_file(fds[b], chunk.data(), bytes, read_off);
+      const size_t nrec = bytes / record_bytes;
+      for (size_t r = 0; r < nrec; ++r) {
+        consume(chunk.data() + r * record_bytes);
+      }
+      read_off += bytes;
+    }
+  }
+};
+
+// Streams the base topology (base_nodes + base_links) in artifact (ACE) order, invoking
+// `cb(original_id, link_row_ptr)` for each row. When `parallel_threads > 1` the callback is invoked
+// concurrently and must be thread-safe for distinct IDs.
+template <typename IdxT, class F>
+void scatter_layered_base_links(const cuvs::util::file_descriptor& artifact_fd,
+                                const hnswlib_materialize_layout& layout,
+                                int parallel_threads,
+                                F&& cb)
+{
+  const size_t n_rows = layout.n_rows;
+  const size_t row    = layout.base_link_row_bytes;
+  const size_t batch  = std::max<size_t>(1, (size_t{64} << 20) / (sizeof(IdxT) + row));
+  std::vector<IdxT> node_buf(batch);
+  std::vector<char> link_buf(batch * row);
+  for (size_t s = 0; s < n_rows; s += batch) {
+    const size_t cur = std::min(batch, n_rows - s);
+    cuvs::util::read_large_file(artifact_fd,
+                                node_buf.data(),
+                                cur * sizeof(IdxT),
+                                layout.base_nodes_offset + s * sizeof(IdxT));
+    cuvs::util::read_large_file(
+      artifact_fd, link_buf.data(), cur * row, layout.base_links_offset + s * row);
+    bool invalid = false;
+    if (parallel_threads > 1) {
+#pragma omp parallel for num_threads(parallel_threads) reduction(|| : invalid)
+      for (int64_t k = 0; k < static_cast<int64_t>(cur); ++k) {
+        const size_t id = static_cast<size_t>(node_buf[k]);
+        if (id >= n_rows) {
+          invalid = true;
+          continue;
+        }
+        cb(id, link_buf.data() + static_cast<size_t>(k) * row);
+      }
+    } else {
+      for (size_t k = 0; k < cur; ++k) {
+        const size_t id = static_cast<size_t>(node_buf[k]);
+        if (id >= n_rows) {
+          invalid = true;
+          break;
+        }
+        cb(id, link_buf.data() + k * row);
+      }
+    }
+    RAFT_EXPECTS(!invalid, "Invalid base-layer node id in layered HNSW artifact");
+  }
+}
+
+// Streams the upper layers, invoking `cb(original_id, level, link_row_ptr)` for every promoted row.
+template <class F>
+void scatter_layered_upper_links(const cuvs::util::file_descriptor& artifact_fd,
+                                 const hnswlib_materialize_layout& layout,
+                                 const std::vector<layered_hnsw_layer_info>& layers,
+                                 const std::vector<uint8_t>& levels_u8,
+                                 F&& cb)
+{
+  const size_t urow   = layout.upper_link_row_bytes;
+  const size_t n_rows = layout.n_rows;
+  for (const auto& layer : layers) {
+    const size_t rc = layer.row_count;
+    if (rc == 0) { continue; }
+    const size_t batch = std::max<size_t>(1, (size_t{64} << 20) / (sizeof(uint32_t) + urow));
+    std::vector<uint32_t> nodes(std::min(batch, rc));
+    std::vector<char> links(std::min(batch, rc) * urow);
+    for (size_t s = 0; s < rc; s += batch) {
+      const size_t cur = std::min(batch, rc - s);
+      cuvs::util::read_large_file(
+        artifact_fd,
+        nodes.data(),
+        cur * sizeof(uint32_t),
+        layout.upper_nodes_offset + (layer.node_offset + s) * sizeof(uint32_t));
+      cuvs::util::read_large_file(artifact_fd,
+                                  links.data(),
+                                  cur * urow,
+                                  layout.upper_links_offset + (layer.link_offset + s) * urow);
+      for (size_t r = 0; r < cur; ++r) {
+        const size_t id = static_cast<size_t>(nodes[r]);
+        RAFT_EXPECTS(id < n_rows, "Invalid upper-layer node id in layered HNSW artifact");
+        RAFT_EXPECTS(layer.level <= static_cast<size_t>(levels_u8[id]),
+                     "Layered HNSW artifact references a node at an invalid upper level");
+        cb(id, layer.level, links.data() + r * urow);
+      }
+    }
+  }
+}
+
+// Emits the level-0 region `[link block | vector | label]` per ID, in increasing ID order, reading
+// the dataset sequentially and writing the output sequentially. `row_ptr(id)` returns a pointer to
+// the ID's level-0 link block in a caller-owned, bounded buffer.
+template <class DatasetBatch, class RowPtr>
+void emit_hnswlib_base_records(const npy_file& dataset_file,
+                               const cuvs::util::file_descriptor& output_fd,
+                               const hnswlib_materialize_layout& layout,
+                               int num_threads,
+                               size_t id_begin,
+                               size_t id_end,
+                               std::vector<char>& out_buffer,
+                               DatasetBatch& dataset_batch,
+                               RowPtr&& row_ptr)
+{
+  const size_t spe            = layout.size_data_per_element;
+  const size_t row            = layout.base_link_row_bytes;
+  const size_t data_size      = layout.data_size;
+  const size_t out_batch_rows = static_cast<size_t>(dataset_batch.extent(0));
+  for (size_t s = id_begin; s < id_end; s += out_batch_rows) {
+    const size_t cur = std::min(out_batch_rows, id_end - s);
+    cuvs::util::read_large_file(dataset_file.fd,
+                                dataset_batch.data_handle(),
+                                cur * data_size,
+                                dataset_file.header_size + s * data_size);
+#pragma omp parallel for num_threads(num_threads)
+    for (int64_t k = 0; k < static_cast<int64_t>(cur); ++k) {
+      const size_t id = s + static_cast<size_t>(k);
+      char* rec       = out_buffer.data() + static_cast<size_t>(k) * spe;
+      std::memcpy(rec, row_ptr(id), row);
+      std::memcpy(rec + layout.offset_data,
+                  dataset_batch.data_handle() + static_cast<size_t>(k) * layout.dim,
+                  data_size);
+      const hnswlib::labeltype label = static_cast<hnswlib::labeltype>(id);
+      std::memcpy(rec + layout.label_offset, &label, sizeof(label));
+    }
+    cuvs::util::write_large_file(
+      output_fd, out_buffer.data(), cur * spe, layout.base_region_offset + s * spe);
+  }
+}
+
+// Phase 1 + 2: reorder the base topology to original-ID order and emit the level-0 region.
+template <typename T, typename IdxT>
+void materialize_hnswlib_base_region(const cuvs::util::file_descriptor& artifact_fd,
+                                     const npy_file& dataset_file,
+                                     const cuvs::util::file_descriptor& output_fd,
+                                     const layered_hnsw_file_metadata& metadata,
+                                     const hnswlib_materialize_layout& layout,
+                                     size_t budget_bytes,
+                                     int num_threads,
+                                     const std::filesystem::path& tmp_dir)
+{
+  const size_t n_rows = layout.n_rows;
+  const size_t row    = layout.base_link_row_bytes;
+
+  const size_t out_batch_rows =
+    std::max<size_t>(1, (size_t{64} << 20) / layout.size_data_per_element);
+  auto dataset_batch = raft::make_host_matrix<T, int64_t>(static_cast<int64_t>(out_batch_rows),
+                                                          static_cast<int64_t>(metadata.dim));
+  std::vector<char> out_buffer(out_batch_rows * layout.size_data_per_element);
+
+  // Tracks that every original ID in [0, n_rows) is produced exactly once by the reorder. The
+  // section-size invariant (validated in Phase 0) guarantees exactly n_rows base-node records, so
+  // a malformed artifact with a missing/duplicate ID would otherwise leave a zero-initialized link
+  // row in the output and silently corrupt the graph; catch it instead of shipping a bad index.
+  std::vector<uint8_t> seen(n_rows, 0);
+  auto verify_full_coverage = [&]() {
+    const auto covered = static_cast<size_t>(std::count(seen.begin(), seen.end(), uint8_t{1}));
+    RAFT_EXPECTS(covered == n_rows,
+                 "Layered HNSW base nodes cover %zu of %zu original ids (missing or duplicate ids)",
+                 covered,
+                 n_rows);
+  };
+
+  const auto base_start_time = std::chrono::steady_clock::now();
+  // Single in-memory pass when the whole base topology fits the budget (no temporary files).
+  if (budget_bytes >= metadata.base_links_bytes) {
+    std::vector<char> ordered_base(metadata.base_links_bytes);
+    scatter_layered_base_links<IdxT>(
+      artifact_fd, layout, num_threads, [&](size_t id, const char* link_row) {
+        std::memcpy(ordered_base.data() + id * row, link_row, row);
+        seen[id] = 1;
+      });
+    verify_full_coverage();
+    emit_hnswlib_base_records(dataset_file,
+                              output_fd,
+                              layout,
+                              num_threads,
+                              0,
+                              n_rows,
+                              out_buffer,
+                              dataset_batch,
+                              [&](size_t id) { return ordered_base.data() + id * row; });
+    RAFT_LOG_INFO("hnswlib materialize: base region written (single-pass) in %ld ms (%.2f GiB)",
+                  elapsed_ms_since(base_start_time),
+                  to_gib(layout.size_data_per_element * n_rows));
+    return;
+  }
+
+  // Bucketed reorder through temporary files for a hard memory budget.
+  const size_t record_bytes = sizeof(IdxT) + row;
+  size_t rows_per_bucket    = std::max<size_t>(1, budget_bytes / record_bytes);
+  rows_per_bucket           = std::min(rows_per_bucket, n_rows);
+  const size_t num_buckets  = (n_rows + rows_per_bucket - 1) / rows_per_bucket;
+  RAFT_LOG_INFO(
+    "hnswlib materialize: base region uses %zu buckets (rows/bucket=%zu, budget=%.2f GiB)",
+    num_buckets,
+    rows_per_bucket,
+    to_gib(budget_bytes));
+
+  id_record_spiller spiller(
+    tmp_dir / "base", num_buckets, rows_per_bucket, record_bytes, budget_bytes / 2);
+  // The spiller's per-bucket mutexes make add() safe under the parallel scatter; each thread stages
+  // the record in its own (thread_local) buffer before the routed append.
+  scatter_layered_base_links<IdxT>(
+    artifact_fd, layout, num_threads, [&](size_t id, const char* link_row) {
+      thread_local std::vector<char> rec;
+      rec.resize(record_bytes);
+      const IdxT id32 = static_cast<IdxT>(id);
+      std::memcpy(rec.data(), &id32, sizeof(IdxT));
+      std::memcpy(rec.data() + sizeof(IdxT), link_row, row);
+      spiller.add(id / rows_per_bucket, rec.data());
+      seen[id] = 1;
+    });
+  spiller.finish_writes();
+  verify_full_coverage();
+
+  std::vector<char> bucket_rows;
+  std::vector<char> chunk;
+  for (size_t b = 0; b < num_buckets; ++b) {
+    const size_t id_begin = b * rows_per_bucket;
+    const size_t id_end   = std::min(n_rows, id_begin + rows_per_bucket);
+    const size_t rib      = id_end - id_begin;
+    bucket_rows.assign(rib * row, 0);
+    spiller.replay(b, chunk, [&](const char* r) {
+      IdxT id32;
+      std::memcpy(&id32, r, sizeof(IdxT));
+      const size_t id = static_cast<size_t>(id32);
+      std::memcpy(bucket_rows.data() + (id - id_begin) * row, r + sizeof(IdxT), row);
+    });
+    emit_hnswlib_base_records(
+      dataset_file,
+      output_fd,
+      layout,
+      num_threads,
+      id_begin,
+      id_end,
+      out_buffer,
+      dataset_batch,
+      [&](size_t id) { return bucket_rows.data() + (id - id_begin) * row; });
+  }
+  RAFT_LOG_INFO("hnswlib materialize: base region written (%zu buckets) in %ld ms (%.2f GiB)",
+                num_buckets,
+                elapsed_ms_since(base_start_time),
+                to_gib(layout.size_data_per_element * n_rows));
+}
+
+// Phase 3: transpose the upper layers into per-element link lists and emit the upper region.
+inline void materialize_hnswlib_upper_region(const cuvs::util::file_descriptor& artifact_fd,
+                                             const cuvs::util::file_descriptor& output_fd,
+                                             const layered_hnsw_file_metadata& metadata,
+                                             const hnswlib_materialize_layout& layout,
+                                             const std::vector<uint8_t>& levels_u8,
+                                             size_t budget_bytes,
+                                             const std::filesystem::path& tmp_dir)
+{
+  const size_t n_rows         = layout.n_rows;
+  const size_t urow           = layout.upper_link_row_bytes;
+  const auto upper_start_time = std::chrono::steady_clock::now();
+
+  // Sequential writer over the (variable-length) upper region.
+  size_t write_off = layout.upper_region_offset;
+  std::vector<char> out_buffer;
+  out_buffer.reserve((size_t{64} << 20) + urow + sizeof(int));
+  auto flush_out = [&]() {
+    if (out_buffer.empty()) { return; }
+    cuvs::util::write_large_file(output_fd, out_buffer.data(), out_buffer.size(), write_off);
+    write_off += out_buffer.size();
+    out_buffer.clear();
+  };
+  auto append_element = [&](size_t level, const char* rows) {
+    const int link_list_size = level > 0 ? static_cast<int>(level * urow) : 0;
+    const auto* p            = reinterpret_cast<const char*>(&link_list_size);
+    out_buffer.insert(out_buffer.end(), p, p + sizeof(int));
+    if (level > 0) { out_buffer.insert(out_buffer.end(), rows, rows + level * urow); }
+    if (out_buffer.size() >= (size_t{64} << 20)) { flush_out(); }
+  };
+
+  const size_t fits_budget = metadata.upper_links_bytes + (n_rows + 1) * sizeof(size_t);
+  if (metadata.upper_links_bytes == 0 || fits_budget <= budget_bytes) {
+    // Pack all upper rows in original-ID order, then stream them out.
+    std::vector<size_t> packed_start(n_rows + 1, 0);
+    for (size_t i = 0; i < n_rows; ++i) {
+      packed_start[i + 1] = packed_start[i] + static_cast<size_t>(levels_u8[i]);
+    }
+    RAFT_EXPECTS(packed_start[n_rows] == metadata.upper_nodes_count,
+                 "Layered HNSW upper rows (%zu) do not match upper_nodes_count (%zu)",
+                 packed_start[n_rows],
+                 metadata.upper_nodes_count);
+    std::vector<char> packed(metadata.upper_links_bytes);
+    scatter_layered_upper_links(artifact_fd,
+                                layout,
+                                metadata.layers,
+                                levels_u8,
+                                [&](size_t id, size_t level, const char* link_row) {
+                                  const size_t dst_row = packed_start[id] + (level - 1);
+                                  std::memcpy(packed.data() + dst_row * urow, link_row, urow);
+                                });
+    size_t cursor = 0;
+    for (size_t id = 0; id < n_rows; ++id) {
+      const size_t level = static_cast<size_t>(levels_u8[id]);
+      append_element(level, level > 0 ? packed.data() + cursor * urow : nullptr);
+      cursor += level;
+    }
+    flush_out();
+    RAFT_LOG_INFO("hnswlib materialize: upper region written (single-pass) in %ld ms (%.2f GiB)",
+                  elapsed_ms_since(upper_start_time),
+                  to_gib(metadata.upper_links_bytes));
+    return;
+  }
+
+  // Bucketed upper transpose for a hard memory budget.
+  const size_t budget_half = std::max<size_t>(1, budget_bytes / 2);
+  size_t num_buckets =
+    std::max<size_t>(1, (metadata.upper_links_bytes + budget_half - 1) / budget_half);
+  size_t rows_per_bucket = std::max<size_t>(1, (n_rows + num_buckets - 1) / num_buckets);
+  num_buckets            = (n_rows + rows_per_bucket - 1) / rows_per_bucket;
+  RAFT_LOG_INFO("hnswlib materialize: upper region uses %zu buckets (rows/bucket=%zu)",
+                num_buckets,
+                rows_per_bucket);
+
+  const size_t record_bytes = 2 * sizeof(uint32_t) + urow;  // [id][level][row]
+  id_record_spiller spiller(
+    tmp_dir / "upper", num_buckets, rows_per_bucket, record_bytes, budget_half);
+  std::vector<char> rec(record_bytes);
+  scatter_layered_upper_links(artifact_fd,
+                              layout,
+                              metadata.layers,
+                              levels_u8,
+                              [&](size_t id, size_t level, const char* link_row) {
+                                const uint32_t id32 = static_cast<uint32_t>(id);
+                                const uint32_t lv32 = static_cast<uint32_t>(level);
+                                std::memcpy(rec.data(), &id32, sizeof(uint32_t));
+                                std::memcpy(rec.data() + sizeof(uint32_t), &lv32, sizeof(uint32_t));
+                                std::memcpy(rec.data() + 2 * sizeof(uint32_t), link_row, urow);
+                                spiller.add(id / rows_per_bucket, rec.data());
+                              });
+  spiller.finish_writes();
+
+  std::vector<char> chunk;
+  for (size_t b = 0; b < num_buckets; ++b) {
+    const size_t id_begin = b * rows_per_bucket;
+    const size_t id_end   = std::min(n_rows, id_begin + rows_per_bucket);
+    const size_t rib      = id_end - id_begin;
+    std::vector<size_t> local_start(rib + 1, 0);
+    for (size_t i = 0; i < rib; ++i) {
+      local_start[i + 1] = local_start[i] + static_cast<size_t>(levels_u8[id_begin + i]);
+    }
+    std::vector<char> packed(local_start[rib] * urow);
+    spiller.replay(b, chunk, [&](const char* r) {
+      uint32_t id32 = 0;
+      uint32_t lv32 = 0;
+      std::memcpy(&id32, r, sizeof(uint32_t));
+      std::memcpy(&lv32, r + sizeof(uint32_t), sizeof(uint32_t));
+      const size_t local   = static_cast<size_t>(id32) - id_begin;
+      const size_t dst_row = local_start[local] + (static_cast<size_t>(lv32) - 1);
+      std::memcpy(packed.data() + dst_row * urow, r + 2 * sizeof(uint32_t), urow);
+    });
+    size_t cursor = 0;
+    for (size_t i = 0; i < rib; ++i) {
+      const size_t level = static_cast<size_t>(levels_u8[id_begin + i]);
+      append_element(level, level > 0 ? packed.data() + cursor * urow : nullptr);
+      cursor += level;
+    }
+  }
+  flush_out();
+  RAFT_LOG_INFO("hnswlib materialize: upper region written (%zu buckets) in %ld ms (%.2f GiB)",
+                num_buckets,
+                elapsed_ms_since(upper_start_time),
+                to_gib(metadata.upper_links_bytes));
+}
+
+// Materialize a layered HNSW artifact + dataset into a standard hnswlib index file on disk, using
+// bounded host memory and sequential disk I/O.
+template <typename T, typename IdxT = uint32_t>
+void materialize_layered_to_hnswlib_on_disk(raft::resources const& res,
+                                            const cuvs::util::file_descriptor& artifact_fd,
+                                            const cuvs::neighbors::hnsw::materialize_params& params,
+                                            const std::string& layered_artifact_path,
+                                            const std::string& output_path,
+                                            int dim,
+                                            cuvs::distance::DistanceType metric)
+{
+  static_assert(std::is_same_v<IdxT, uint32_t>, "Layered HNSW artifacts store ids as uint32_t");
+  common::nvtx::range<common::nvtx::domain::cuvs> fun_scope("hnsw::materialize_to_hnswlib");
+  const auto total_start_time = std::chrono::steady_clock::now();
+
+  // ---- Phase 0: read and validate the artifact header, descriptors and levels ----
+  // Shared with the in-memory deserialize path so the format/offset arithmetic and every
+  // size-invariant check live in one place; this also bounds the buffer allocations below.
+  auto view =
+    read_and_validate_layered_artifact<T>(artifact_fd, layered_artifact_path, dim, metric);
+  auto& metadata                = view.metadata;
+  auto& levels_u8               = view.levels;
+  const auto base_nodes_offset  = view.base_nodes_offset;
+  const auto base_links_offset  = view.base_links_offset;
+  const auto upper_nodes_offset = view.upper_nodes_offset;
+  const auto upper_links_offset = view.upper_links_offset;
+  RAFT_EXPECTS(!params.dataset_path.empty(),
+               "Layered HNSW materialization requires materialize_params.dataset_path");
+
+  const size_t n_rows = metadata.n_rows;
+
+  // Validate the dataset header.
+  auto dataset_file = open_layered_dataset_file<T>(params.dataset_path);
+  RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == n_rows &&
+                 dataset_file.shape[1] == metadata.dim,
+               "Layered HNSW dataset shape mismatch: artifact rows=%zu dim=%zu, dataset path=%s",
+               n_rows,
+               metadata.dim,
+               params.dataset_path.c_str());
+
+  // Retrieve the hnswlib layout constants from a dummy (single-element) index.
+  auto hnsw_index = std::make_unique<index_impl<T>>(dim, metric, HnswHierarchy::CPU);
+  auto appr_algo  = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
+    hnsw_index->get_space(), 1, metadata.M, metadata.ef_construction);
+  const size_t data_size = static_cast<size_t>(dim) * sizeof(T);
+  RAFT_EXPECTS(appr_algo->size_links_level0_ == metadata.base_link_row_bytes,
+               "Layered HNSW base link row size mismatch");
+  RAFT_EXPECTS(appr_algo->size_links_per_element_ == metadata.upper_link_row_bytes,
+               "Layered HNSW upper link row size mismatch");
+  RAFT_EXPECTS(appr_algo->maxM0_ == metadata.maxM0 && appr_algo->maxM_ == metadata.maxM,
+               "Layered HNSW M parameter mismatch");
+  RAFT_EXPECTS(appr_algo->data_size_ == data_size, "Layered HNSW data size mismatch");
+  RAFT_EXPECTS(appr_algo->offsetData_ == appr_algo->size_links_level0_,
+               "Unexpected hnswlib data offset");
+  RAFT_EXPECTS(appr_algo->label_offset_ == appr_algo->size_links_level0_ + data_size,
+               "Unexpected hnswlib label offset");
+
+  // ---- Output layout ----
+  const auto native_header = make_hnswlib_native_header(appr_algo->offsetLevel0_,
+                                                        n_rows,
+                                                        appr_algo->size_data_per_element_,
+                                                        appr_algo->label_offset_,
+                                                        appr_algo->offsetData_,
+                                                        metadata.maxlevel,
+                                                        metadata.enterpoint_node,
+                                                        metadata.maxM,
+                                                        metadata.maxM0,
+                                                        metadata.M,
+                                                        metadata.mult,
+                                                        metadata.ef_construction);
+  hnswlib_materialize_layout layout;
+  layout.n_rows                = n_rows;
+  layout.dim                   = metadata.dim;
+  layout.base_link_row_bytes   = metadata.base_link_row_bytes;
+  layout.upper_link_row_bytes  = metadata.upper_link_row_bytes;
+  layout.size_data_per_element = appr_algo->size_data_per_element_;
+  layout.offset_data           = appr_algo->offsetData_;
+  layout.label_offset          = appr_algo->label_offset_;
+  layout.data_size             = data_size;
+  layout.base_region_offset    = native_header.size();
+  layout.base_nodes_offset     = base_nodes_offset;
+  layout.base_links_offset     = base_links_offset;
+  layout.upper_nodes_offset    = upper_nodes_offset;
+  layout.upper_links_offset    = upper_links_offset;
+
+  const size_t base_region_bytes = n_rows * layout.size_data_per_element;
+  layout.upper_region_offset     = layout.base_region_offset + base_region_bytes;
+
+  size_t upper_region_bytes = 0;
+  for (size_t i = 0; i < n_rows; ++i) {
+    upper_region_bytes += sizeof(int);
+    const size_t level = static_cast<size_t>(levels_u8[i]);
+    if (level > 0) { upper_region_bytes += level * layout.upper_link_row_bytes; }
+  }
+  const size_t final_size = layout.upper_region_offset + upper_region_bytes;
+
+  cuvs::util::file_descriptor output_fd(output_path, O_CREAT | O_RDWR | O_TRUNC, 0644);
+  cuvs::util::preallocate_file(output_fd, final_size);
+  cuvs::util::write_large_file(output_fd, native_header.data(), native_header.size(), 0);
+  RAFT_LOG_INFO(
+    "hnswlib materialize: writing index (rows=%zu dim=%zu, output=%.2f GiB, dataset=%s)",
+    n_rows,
+    metadata.dim,
+    to_gib(final_size),
+    params.dataset_path.c_str());
+
+  auto num_threads =
+    params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
+  // max_host_memory_gb <= 0 => no host-memory cap: both regions take their single in-memory pass
+  // and create no temporary files. Otherwise reserve headroom for the fixed-size streaming I/O
+  // buffers (dataset batch, output buffer, per-call scatter read buffers, each ~64 MiB) so peak
+  // host memory stays close to the requested budget rather than overshooting it by the buffer
+  // overhead.
+  size_t budget_bytes = std::numeric_limits<size_t>::max();
+  if (params.max_host_memory_gb > 0.0) {
+    const size_t requested =
+      std::max<size_t>(1, static_cast<size_t>(params.max_host_memory_gb * (size_t{1} << 30)));
+    constexpr size_t kStreamingBufferReserve = size_t{4} * (size_t{64} << 20);  // ~256 MiB
+    budget_bytes = requested > kStreamingBufferReserve ? requested - kStreamingBufferReserve
+                                                       : std::max<size_t>(1, requested / 2);
+  }
+
+  std::filesystem::path tmp_dir = output_path;
+  tmp_dir += ".materialize_tmp";
+
+  materialize_hnswlib_base_region<T, IdxT>(
+    artifact_fd, dataset_file, output_fd, metadata, layout, budget_bytes, num_threads, tmp_dir);
+  materialize_hnswlib_upper_region(
+    artifact_fd, output_fd, metadata, layout, levels_u8, budget_bytes, tmp_dir);
+
+  std::error_code ec;
+  std::filesystem::remove_all(tmp_dir, ec);
+
+  RAFT_LOG_INFO("hnswlib materialize: completed in %ld ms (output %.2f GiB)",
+                elapsed_ms_since(total_start_time),
+                to_gib(final_size));
+}
+
+// Reads the dtype tag from the artifact header and dispatches to the typed implementation.
+inline void materialize_layered_to_hnswlib_on_disk_dispatch(
+  raft::resources const& res,
+  const cuvs::neighbors::hnsw::materialize_params& params,
+  const std::string& layered_artifact_path,
+  const std::string& output_path,
+  int dim,
+  cuvs::distance::DistanceType metric)
+{
+  // Open the artifact once and read just the dtype tag to pick T; the typed worker reuses this fd
+  // (no second open) and performs full header validation via read_and_validate_layered_artifact.
+  cuvs::util::file_descriptor artifact_fd(layered_artifact_path, O_RDONLY);
+  layered_hnsw_file_header header{};
+  cuvs::util::read_large_file(artifact_fd, &header, sizeof(header), 0);
+  RAFT_EXPECTS(std::strncmp(header.magic, layered_hnsw_magic, sizeof(header.magic)) == 0,
+               "Invalid layered HNSW artifact magic: %s",
+               layered_artifact_path.c_str());
+  switch (static_cast<layered_hnsw_dtype>(header.dtype)) {
+    case layered_hnsw_dtype::float32:
+      materialize_layered_to_hnswlib_on_disk<float>(
+        res, artifact_fd, params, layered_artifact_path, output_path, dim, metric);
+      break;
+    case layered_hnsw_dtype::float16:
+      materialize_layered_to_hnswlib_on_disk<half>(
+        res, artifact_fd, params, layered_artifact_path, output_path, dim, metric);
+      break;
+    case layered_hnsw_dtype::uint8:
+      materialize_layered_to_hnswlib_on_disk<uint8_t>(
+        res, artifact_fd, params, layered_artifact_path, output_path, dim, metric);
+      break;
+    case layered_hnsw_dtype::int8:
+      materialize_layered_to_hnswlib_on_disk<int8_t>(
+        res, artifact_fd, params, layered_artifact_path, output_path, dim, metric);
+      break;
+    default: RAFT_FAIL("Unsupported layered HNSW artifact dtype %u", header.dtype);
+  }
+}
+
 template <typename T>
 void deserialize(raft::resources const& res,
                  const index_params& params,
@@ -1249,6 +3160,12 @@ void deserialize(raft::resources const& res,
                  cuvs::distance::DistanceType metric,
                  index<T>** idx)
 {
+  if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    auto hnsw_index = deserialize_layered_hnsw<T>(res, params, filename, dim, metric);
+    *idx            = hnsw_index.release();
+    return;
+  }
+
   try {
     auto hnsw_index = std::make_unique<index_impl<T>>(dim, metric, params.hierarchy);
     auto appr_algo  = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
@@ -1286,6 +3203,13 @@ std::unique_ptr<index<T>> build(raft::resources const& res,
       ? std::get<graph_build_params::ace_params>(params.graph_build_params)
       : graph_build_params::ace_params{};
 
+  if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) {
+    RAFT_EXPECTS(ace_params.use_disk,
+                 "GPU_LAYERED_ON_DISK requires ACE disk mode (ace_params.use_disk = true)");
+    RAFT_EXPECTS(!ace_params.build_dir.empty(),
+                 "GPU_LAYERED_ON_DISK requires ace_params.build_dir to be set");
+  }
+
   // Create CAGRA index parameters from HNSW parameters
   cuvs::neighbors::cagra::index_params cagra_params;
   cagra_params.metric                    = params.metric;
@@ -1302,15 +3226,14 @@ std::unique_ptr<index<T>> build(raft::resources const& res,
   cagra_ace_params.max_gpu_memory_gb  = ace_params.max_gpu_memory_gb;
   cagra_params.graph_build_params     = cagra_ace_params;
 
-  RAFT_LOG_INFO(
-    "hnsw::build - Building HNSW index using ACE with %zu partitions, ef_construction=%zu",
-    ace_params.npartitions,
-    ace_params.ef_construction);
+  RAFT_LOG_INFO("HNSW ACE build: building CAGRA graph (partitions=%zu ef_construction=%zu)",
+                cagra_ace_params.npartitions,
+                cagra_ace_params.ef_construction);
 
   // Build CAGRA index using ACE
   auto cagra_index = cuvs::neighbors::cagra::build(res, cagra_params, dataset);
 
-  RAFT_LOG_INFO("hnsw::build - Converting CAGRA index to HNSW format");
+  RAFT_LOG_INFO("HNSW ACE build: converting CAGRA graph to HNSW hierarchy");
 
   // Convert CAGRA index to HNSW index
   return from_cagra<T>(res, params, cagra_index, dataset);
diff --git a/cpp/src/neighbors/hnsw.cpp b/cpp/src/neighbors/hnsw.cpp
index 54e9dcf12a..a3d6ca3ab6 100644
--- a/cpp/src/neighbors/hnsw.cpp
+++ b/cpp/src/neighbors/hnsw.cpp
@@ -115,4 +115,17 @@ CUVS_INST_HNSW_SERIALIZE(int8_t);
 
 #undef CUVS_INST_HNSW_SERIALIZE
 
+// The element data type is read from the artifact header; the dispatcher selects the typed
+// implementation, instantiating the float/half/uint8_t/int8_t materialize paths in this TU.
+void materialize_to_hnswlib(raft::resources const& res,
+                            const materialize_params& params,
+                            const std::string& layered_artifact_path,
+                            const std::string& output_path,
+                            int dim,
+                            cuvs::distance::DistanceType metric)
+{
+  detail::materialize_layered_to_hnswlib_on_disk_dispatch(
+    res, params, layered_artifact_path, output_path, dim, metric);
+}
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/src/util/file_io.cpp b/cpp/src/util/file_io.cpp
index d924527e72..dac930cad2 100644
--- a/cpp/src/util/file_io.cpp
+++ b/cpp/src/util/file_io.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
@@ -10,10 +10,29 @@
 #include <limits.h>
 
 #include <errno.h>
+#include <fcntl.h>
 #include <unistd.h>
 
 namespace cuvs::util {
 
+void preallocate_file(const file_descriptor& fd, const size_t total_bytes)
+{
+  if (total_bytes == 0) { return; }
+  RAFT_EXPECTS(fd.is_valid(), "File descriptor must be valid");
+  const int rc = posix_fallocate(fd.get(), 0, static_cast<off_t>(total_bytes));
+  if (rc == 0) { return; }
+  // Some filesystems (tmpfs, certain NFS/overlay mounts) do not support preallocation; fall back
+  // to ftruncate so a valid output location is still usable.
+  if (rc == EOPNOTSUPP || rc == EINVAL || rc == ENOSYS) {
+    RAFT_EXPECTS(ftruncate(fd.get(), static_cast<off_t>(total_bytes)) == 0,
+                 "Failed to pre-size file %s via ftruncate: %s",
+                 fd.get_path().c_str(),
+                 strerror(errno));
+    return;
+  }
+  RAFT_FAIL("Failed to pre-allocate file %s: %s", fd.get_path().c_str(), strerror(rc));
+}
+
 void read_large_file(const file_descriptor& fd,
                      void* dest_ptr,
                      const size_t total_bytes,
diff --git a/cpp/tests/neighbors/ann_hnsw_ace.cuh b/cpp/tests/neighbors/ann_hnsw_ace.cuh
index 78ae54d9fc..f0eeaa51ad 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace.cuh
+++ b/cpp/tests/neighbors/ann_hnsw_ace.cuh
@@ -7,9 +7,18 @@
 #include "ann_cagra.cuh"
 
 #include <cuvs/neighbors/hnsw.hpp>
+#include <cuvs/util/file_io.hpp>
 
+#include <array>
+#include <cstdint>
 #include <cstdio>
+#include <ctime>
+#include <exception>
 #include <filesystem>
+#include <fstream>
+#include <iterator>
+#include <memory>
+#include <vector>
 
 namespace cuvs::neighbors::hnsw {
 
@@ -259,6 +268,438 @@ class AnnHnswAceTest : public ::testing::TestWithParam<AnnHnswAceInputs> {
     std::filesystem::remove_all(temp_dir);
   }
 
+  void testHnswAceLayeredBuildDeserializeSearch()
+  {
+    size_t queries_size = ps.n_queries * ps.k;
+    std::vector<IdxT> indexes_naive(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indexes_naive_dev(queries_size, stream_);
+
+      cuvs::neighbors::naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                                         distances_naive_dev.data(),
+                                                         indexes_naive_dev.data(),
+                                                         search_queries.data(),
+                                                         database_dev.data(),
+                                                         ps.n_queries,
+                                                         ps.n_rows,
+                                                         ps.dim,
+                                                         ps.k,
+                                                         ps.metric);
+      raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      raft::update_host(indexes_naive.data(), indexes_naive_dev.data(), queries_size, stream_);
+      raft::resource::sync_stream(handle_);
+    }
+
+    std::string temp_dir = std::string("/tmp/cuvs_hnsw_ace_layered_test_") +
+                           std::to_string(std::time(nullptr)) + "_" +
+                           std::to_string(reinterpret_cast<uintptr_t>(this));
+    std::filesystem::create_directories(temp_dir);
+    struct temp_dir_cleanup {
+      std::string path;
+      ~temp_dir_cleanup()
+      {
+        std::error_code ec;
+        std::filesystem::remove_all(path, ec);
+      }
+    } cleanup{temp_dir};
+
+    auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+    raft::copy(database_host.data_handle(), database_dev.data(), ps.n_rows * ps.dim, stream_);
+    auto queries_host = raft::make_host_matrix<DataT, int64_t>(ps.n_queries, ps.dim);
+    raft::copy(queries_host.data_handle(), search_queries.data(), ps.n_queries * ps.dim, stream_);
+    raft::resource::sync_stream(handle_);
+
+    const auto dataset_file = (std::filesystem::path(temp_dir) / "dataset.npy").string();
+    auto [dataset_fd, dataset_header_size] = cuvs::util::create_numpy_file<DataT>(
+      dataset_file, {static_cast<size_t>(ps.n_rows), static_cast<size_t>(ps.dim)});
+    cuvs::util::write_large_file(dataset_fd,
+                                 database_host.data_handle(),
+                                 static_cast<size_t>(ps.n_rows) * ps.dim * sizeof(DataT),
+                                 dataset_header_size);
+
+    hnsw::index_params hnsw_params;
+    hnsw_params.metric          = ps.metric;
+    hnsw_params.hierarchy       = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK;
+    hnsw_params.M               = 32;
+    hnsw_params.ef_construction = ps.ef_construction;
+    hnsw_params.dataset_path    = dataset_file;
+
+    auto ace_params                = graph_build_params::ace_params();
+    ace_params.npartitions         = ps.npartitions;
+    ace_params.build_dir           = temp_dir;
+    ace_params.use_disk            = true;
+    ace_params.max_host_memory_gb  = ps.max_host_memory_gb;
+    ace_params.max_gpu_memory_gb   = ps.max_gpu_memory_gb;
+    hnsw_params.graph_build_params = ace_params;
+
+    auto hnsw_index =
+      hnsw::build(handle_, hnsw_params, raft::make_const_mdspan(database_host.view()));
+    ASSERT_NE(hnsw_index, nullptr);
+
+    const auto artifact_path = hnsw_index->file_path();
+    ASSERT_FALSE(artifact_path.empty());
+    ASSERT_TRUE(std::filesystem::is_regular_file(artifact_path));
+    EXPECT_EQ(std::filesystem::path(artifact_path).filename().string(), "hnsw_index.cuvs");
+    size_t cuvs_artifact_count = 0;
+    for (const auto& entry : std::filesystem::directory_iterator(temp_dir)) {
+      if (entry.path().extension() == ".cuvs") { ++cuvs_artifact_count; }
+    }
+    EXPECT_EQ(cuvs_artifact_count, 1);
+    EXPECT_FALSE(std::filesystem::exists(std::filesystem::path(temp_dir) / "layered_hnsw"));
+
+    auto indexes_hnsw_host   = raft::make_host_matrix<uint64_t, int64_t>(ps.n_queries, ps.k);
+    auto distances_hnsw_host = raft::make_host_matrix<DistanceT, int64_t>(ps.n_queries, ps.k);
+
+    hnsw::search_params search_params;
+    search_params.ef          = std::max(ps.ef_construction, ps.k * 2);
+    search_params.num_threads = 1;
+
+    EXPECT_THROW(hnsw::search(handle_,
+                              search_params,
+                              *hnsw_index,
+                              queries_host.view(),
+                              indexes_hnsw_host.view(),
+                              distances_hnsw_host.view()),
+                 std::exception);
+
+    hnsw::index<DataT>* deserialized_index = nullptr;
+    hnsw::deserialize(handle_, hnsw_params, artifact_path, ps.dim, ps.metric, &deserialized_index);
+    ASSERT_NE(deserialized_index, nullptr);
+    std::unique_ptr<hnsw::index<DataT>> deserialized_guard(deserialized_index);
+
+    hnsw::search(handle_,
+                 search_params,
+                 *deserialized_guard,
+                 queries_host.view(),
+                 indexes_hnsw_host.view(),
+                 distances_hnsw_host.view());
+
+    std::vector<IdxT> indexes_hnsw_converted(queries_size);
+    std::vector<DistanceT> distances_hnsw(queries_size);
+    for (size_t i = 0; i < queries_size; i++) {
+      indexes_hnsw_converted[i] = static_cast<IdxT>(indexes_hnsw_host.data_handle()[i]);
+      distances_hnsw[i]         = distances_hnsw_host.data_handle()[i];
+    }
+
+    EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive,
+                                                 indexes_hnsw_converted,
+                                                 distances_naive,
+                                                 distances_hnsw,
+                                                 ps.n_queries,
+                                                 ps.k,
+                                                 0.003,
+                                                 ps.min_recall))
+      << "Layered HNSW deserialize and search failed recall check";
+
+    const auto copied_artifact =
+      (std::filesystem::path(temp_dir) / "copied_layered" / "hnsw_index.cuvs").string();
+    hnsw::serialize(handle_, copied_artifact, *hnsw_index);
+    EXPECT_TRUE(std::filesystem::is_regular_file(copied_artifact));
+    size_t copied_file_count = 0;
+    for (const auto& entry : std::filesystem::directory_iterator(
+           std::filesystem::path(copied_artifact).parent_path())) {
+      if (entry.is_regular_file()) { ++copied_file_count; }
+    }
+    EXPECT_EQ(copied_file_count, 1);
+
+    hnsw::index<DataT>* copied_index = nullptr;
+    hnsw::deserialize(handle_, hnsw_params, copied_artifact, ps.dim, ps.metric, &copied_index);
+    ASSERT_NE(copied_index, nullptr);
+    std::unique_ptr<hnsw::index<DataT>> copied_guard(copied_index);
+
+    hnsw::search(handle_,
+                 search_params,
+                 *copied_guard,
+                 queries_host.view(),
+                 indexes_hnsw_host.view(),
+                 distances_hnsw_host.view());
+
+    for (size_t i = 0; i < queries_size; i++) {
+      indexes_hnsw_converted[i] = static_cast<IdxT>(indexes_hnsw_host.data_handle()[i]);
+      distances_hnsw[i]         = distances_hnsw_host.data_handle()[i];
+    }
+    EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive,
+                                                 indexes_hnsw_converted,
+                                                 distances_naive,
+                                                 distances_hnsw,
+                                                 ps.n_queries,
+                                                 ps.k,
+                                                 0.003,
+                                                 ps.min_recall))
+      << "Copied layered HNSW artifact deserialize and search failed recall check";
+
+    const auto bad_artifact =
+      (std::filesystem::path(temp_dir) / "bad_layered" / "bad_magic.cuvs").string();
+    std::filesystem::create_directories(std::filesystem::path(bad_artifact).parent_path());
+    {
+      std::ofstream bad_file(bad_artifact, std::ios::binary);
+      std::array<char, 64> bad_bytes{};
+      bad_file.write(bad_bytes.data(), bad_bytes.size());
+    }
+    hnsw::index<DataT>* bad_index = nullptr;
+    EXPECT_THROW(
+      hnsw::deserialize(handle_, hnsw_params, bad_artifact, ps.dim, ps.metric, &bad_index),
+      std::exception);
+
+    const auto bad_version_artifact =
+      (std::filesystem::path(temp_dir) / "bad_layered" / "bad_version.cuvs").string();
+    std::filesystem::copy_file(
+      copied_artifact, bad_version_artifact, std::filesystem::copy_options::overwrite_existing);
+    constexpr std::streamoff layered_header_version_offset = 32;
+    {
+      std::fstream bad_version_file(bad_version_artifact,
+                                    std::ios::in | std::ios::out | std::ios::binary);
+      const uint32_t bad_version = 999;
+      bad_version_file.seekp(layered_header_version_offset);
+      bad_version_file.write(reinterpret_cast<const char*>(&bad_version), sizeof(bad_version));
+    }
+    hnsw::index<DataT>* bad_version_index = nullptr;
+    EXPECT_THROW(
+      hnsw::deserialize(
+        handle_, hnsw_params, bad_version_artifact, ps.dim, ps.metric, &bad_version_index),
+      std::exception);
+
+    auto missing_dataset_params = hnsw_params;
+    missing_dataset_params.dataset_path.clear();
+    hnsw::index<DataT>* missing_dataset_index = nullptr;
+    EXPECT_THROW(hnsw::deserialize(handle_,
+                                   missing_dataset_params,
+                                   copied_artifact,
+                                   ps.dim,
+                                   ps.metric,
+                                   &missing_dataset_index),
+                 std::exception);
+
+    const auto truncated_artifact =
+      (std::filesystem::path(temp_dir) / "bad_layered" / "truncated.cuvs").string();
+    std::filesystem::copy_file(
+      copied_artifact, truncated_artifact, std::filesystem::copy_options::overwrite_existing);
+    std::filesystem::resize_file(truncated_artifact, 128);
+    hnsw::index<DataT>* truncated_index = nullptr;
+    EXPECT_THROW(hnsw::deserialize(
+                   handle_, hnsw_params, truncated_artifact, ps.dim, ps.metric, &truncated_index),
+                 std::exception);
+
+    const auto wrong_dataset_file =
+      (std::filesystem::path(temp_dir) / "bad_layered" / "wrong_dataset.npy").string();
+    auto [wrong_dataset_fd, wrong_dataset_header_size] = cuvs::util::create_numpy_file<DataT>(
+      wrong_dataset_file, {static_cast<size_t>(ps.n_rows - 1), static_cast<size_t>(ps.dim)});
+    cuvs::util::write_large_file(wrong_dataset_fd,
+                                 database_host.data_handle(),
+                                 static_cast<size_t>(ps.n_rows - 1) * ps.dim * sizeof(DataT),
+                                 wrong_dataset_header_size);
+    auto wrong_dataset_params               = hnsw_params;
+    wrong_dataset_params.dataset_path       = wrong_dataset_file;
+    hnsw::index<DataT>* wrong_dataset_index = nullptr;
+    EXPECT_THROW(
+      hnsw::deserialize(
+        handle_, wrong_dataset_params, copied_artifact, ps.dim, ps.metric, &wrong_dataset_index),
+      std::exception);
+  }
+
+  void testHnswAceLayeredMaterializeToHnswlib()
+  {
+    size_t queries_size = ps.n_queries * ps.k;
+    std::vector<IdxT> indexes_naive(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
+
+    {
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indexes_naive_dev(queries_size, stream_);
+      cuvs::neighbors::naive_knn<DistanceT, DataT, IdxT>(handle_,
+                                                         distances_naive_dev.data(),
+                                                         indexes_naive_dev.data(),
+                                                         search_queries.data(),
+                                                         database_dev.data(),
+                                                         ps.n_queries,
+                                                         ps.n_rows,
+                                                         ps.dim,
+                                                         ps.k,
+                                                         ps.metric);
+      raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      raft::update_host(indexes_naive.data(), indexes_naive_dev.data(), queries_size, stream_);
+      raft::resource::sync_stream(handle_);
+    }
+
+    std::string temp_dir = std::string("/tmp/cuvs_hnsw_ace_materialize_test_") +
+                           std::to_string(std::time(nullptr)) + "_" +
+                           std::to_string(reinterpret_cast<uintptr_t>(this));
+    std::filesystem::create_directories(temp_dir);
+    struct temp_dir_cleanup {
+      std::string path;
+      ~temp_dir_cleanup()
+      {
+        std::error_code ec;
+        std::filesystem::remove_all(path, ec);
+      }
+    } cleanup{temp_dir};
+
+    auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+    raft::copy(database_host.data_handle(), database_dev.data(), ps.n_rows * ps.dim, stream_);
+    auto queries_host = raft::make_host_matrix<DataT, int64_t>(ps.n_queries, ps.dim);
+    raft::copy(queries_host.data_handle(), search_queries.data(), ps.n_queries * ps.dim, stream_);
+    raft::resource::sync_stream(handle_);
+
+    const auto dataset_file = (std::filesystem::path(temp_dir) / "dataset.npy").string();
+    auto [dataset_fd, dataset_header_size] = cuvs::util::create_numpy_file<DataT>(
+      dataset_file, {static_cast<size_t>(ps.n_rows), static_cast<size_t>(ps.dim)});
+    cuvs::util::write_large_file(dataset_fd,
+                                 database_host.data_handle(),
+                                 static_cast<size_t>(ps.n_rows) * ps.dim * sizeof(DataT),
+                                 dataset_header_size);
+
+    hnsw::index_params hnsw_params;
+    hnsw_params.metric          = ps.metric;
+    hnsw_params.hierarchy       = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK;
+    hnsw_params.M               = 32;
+    hnsw_params.ef_construction = ps.ef_construction;
+    hnsw_params.dataset_path    = dataset_file;
+
+    auto ace_params                = graph_build_params::ace_params();
+    ace_params.npartitions         = ps.npartitions;
+    ace_params.build_dir           = temp_dir;
+    ace_params.use_disk            = true;
+    ace_params.max_host_memory_gb  = ps.max_host_memory_gb;
+    ace_params.max_gpu_memory_gb   = ps.max_gpu_memory_gb;
+    hnsw_params.graph_build_params = ace_params;
+
+    auto hnsw_index =
+      hnsw::build(handle_, hnsw_params, raft::make_const_mdspan(database_host.view()));
+    ASSERT_NE(hnsw_index, nullptr);
+    const auto artifact_path = hnsw_index->file_path();
+    ASSERT_FALSE(artifact_path.empty());
+
+    hnsw::search_params search_params;
+    search_params.ef          = std::max(ps.ef_construction, ps.k * 2);
+    search_params.num_threads = 1;
+
+    auto indexes_hnsw_host   = raft::make_host_matrix<uint64_t, int64_t>(ps.n_queries, ps.k);
+    auto distances_hnsw_host = raft::make_host_matrix<DistanceT, int64_t>(ps.n_queries, ps.k);
+
+    // Reference: load the layered artifact in RAM and search.
+    std::vector<IdxT> indexes_layered(queries_size);
+    std::vector<DistanceT> distances_layered(queries_size);
+    {
+      hnsw::index<DataT>* layered_index = nullptr;
+      hnsw::deserialize(handle_, hnsw_params, artifact_path, ps.dim, ps.metric, &layered_index);
+      ASSERT_NE(layered_index, nullptr);
+      std::unique_ptr<hnsw::index<DataT>> layered_guard(layered_index);
+      hnsw::search(handle_,
+                   search_params,
+                   *layered_guard,
+                   queries_host.view(),
+                   indexes_hnsw_host.view(),
+                   distances_hnsw_host.view());
+      for (size_t i = 0; i < queries_size; i++) {
+        indexes_layered[i]   = static_cast<IdxT>(indexes_hnsw_host.data_handle()[i]);
+        distances_layered[i] = distances_hnsw_host.data_handle()[i];
+      }
+    }
+
+    // Materialize to a standard hnswlib index file twice: single in-memory pass and bucketed.
+    const auto out_single = (std::filesystem::path(temp_dir) / "materialized_single.bin").string();
+    const auto out_bucketed =
+      (std::filesystem::path(temp_dir) / "materialized_bucketed.bin").string();
+
+    hnsw::materialize_params materialize_single;
+    materialize_single.dataset_path       = dataset_file;
+    materialize_single.max_host_memory_gb = 0;  // single in-memory reorder pass
+    hnsw::materialize_to_hnswlib(
+      handle_, materialize_single, artifact_path, out_single, ps.dim, ps.metric);
+
+    hnsw::materialize_params materialize_bucketed;
+    materialize_bucketed.dataset_path       = dataset_file;
+    materialize_bucketed.max_host_memory_gb = 0.00003;  // tiny budget forces bucketed base + upper
+    hnsw::materialize_to_hnswlib(
+      handle_, materialize_bucketed, artifact_path, out_bucketed, ps.dim, ps.metric);
+
+    ASSERT_TRUE(std::filesystem::is_regular_file(out_single));
+    ASSERT_TRUE(std::filesystem::is_regular_file(out_bucketed));
+
+    // Determinism: single-pass and bucketed outputs must be byte-identical.
+    {
+      const auto read_all = [](const std::string& path) {
+        std::ifstream stream(path, std::ios::binary);
+        return std::vector<char>((std::istreambuf_iterator<char>(stream)),
+                                 std::istreambuf_iterator<char>());
+      };
+      const auto bytes_single   = read_all(out_single);
+      const auto bytes_bucketed = read_all(out_bucketed);
+      ASSERT_FALSE(bytes_single.empty());
+      EXPECT_EQ(bytes_single.size(), bytes_bucketed.size());
+      EXPECT_TRUE(bytes_single == bytes_bucketed)
+        << "Single-pass and bucketed materialized outputs differ";
+    }
+
+    // Load the materialized file as a standard (CPU) hnswlib index and search.
+    hnsw::index_params cpu_params;
+    cpu_params.metric             = ps.metric;
+    cpu_params.hierarchy          = hnsw::HnswHierarchy::CPU;
+    hnsw::index<DataT>* cpu_index = nullptr;
+    hnsw::deserialize(handle_, cpu_params, out_single, ps.dim, ps.metric, &cpu_index);
+    ASSERT_NE(cpu_index, nullptr);
+    std::unique_ptr<hnsw::index<DataT>> cpu_guard(cpu_index);
+
+    hnsw::search(handle_,
+                 search_params,
+                 *cpu_guard,
+                 queries_host.view(),
+                 indexes_hnsw_host.view(),
+                 distances_hnsw_host.view());
+
+    std::vector<IdxT> indexes_cpu(queries_size);
+    std::vector<DistanceT> distances_cpu(queries_size);
+    for (size_t i = 0; i < queries_size; i++) {
+      indexes_cpu[i]   = static_cast<IdxT>(indexes_hnsw_host.data_handle()[i]);
+      distances_cpu[i] = distances_hnsw_host.data_handle()[i];
+    }
+
+    EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive,
+                                                 indexes_cpu,
+                                                 distances_naive,
+                                                 distances_cpu,
+                                                 ps.n_queries,
+                                                 ps.k,
+                                                 0.003,
+                                                 ps.min_recall))
+      << "Materialized hnswlib index failed recall check vs. ground truth";
+
+    // The materialized CPU index represents the same graph and vectors as the layered artifact,
+    // so its search results must match the in-memory layered path.
+    EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_layered,
+                                                 indexes_cpu,
+                                                 distances_layered,
+                                                 distances_cpu,
+                                                 ps.n_queries,
+                                                 ps.k,
+                                                 0.003,
+                                                 0.99))
+      << "Materialized hnswlib index disagrees with the in-memory layered path";
+
+    hnsw::materialize_params bad_params;
+    bad_params.dataset_path = dataset_file;
+    const auto err_out      = (std::filesystem::path(temp_dir) / "err.bin").string();
+    EXPECT_THROW(hnsw::materialize_to_hnswlib(
+                   handle_, bad_params, artifact_path, err_out, ps.dim + 1, ps.metric),
+                 std::exception);
+
+    const auto wrong_metric = ps.metric == cuvs::distance::DistanceType::L2Expanded
+                                ? cuvs::distance::DistanceType::InnerProduct
+                                : cuvs::distance::DistanceType::L2Expanded;
+    EXPECT_THROW(hnsw::materialize_to_hnswlib(
+                   handle_, bad_params, artifact_path, err_out, ps.dim, wrong_metric),
+                 std::exception);
+
+    hnsw::materialize_params missing_dataset;
+    missing_dataset.dataset_path.clear();
+    EXPECT_THROW(hnsw::materialize_to_hnswlib(
+                   handle_, missing_dataset, artifact_path, err_out, ps.dim, ps.metric),
+                 std::exception);
+  }
+
   void SetUp() override
   {
     database_dev.resize(((size_t)ps.n_rows) * ps.dim, stream_);
@@ -323,8 +764,26 @@ inline std::vector<AnnHnswAceInputs> generate_hnsw_ace_memory_fallback_inputs()
   };
 }
 
+inline std::vector<AnnHnswAceInputs> generate_hnsw_ace_layered_inputs()
+{
+  return {
+    {10,    // n_queries
+     5000,  // n_rows
+     64,    // dim
+     10,    // k
+     2,     // npartitions
+     100,   // ef_construction
+     true,  // use_disk
+     cuvs::distance::DistanceType::L2Expanded,
+     0.9,  // min_recall
+     0.0,  // max_host_memory_gb
+     0.0}  // max_gpu_memory_gb
+  };
+}
+
 const std::vector<AnnHnswAceInputs> hnsw_ace_inputs = generate_hnsw_ace_inputs();
 const std::vector<AnnHnswAceInputs> hnsw_ace_memory_fallback_inputs =
   generate_hnsw_ace_memory_fallback_inputs();
+const std::vector<AnnHnswAceInputs> hnsw_ace_layered_inputs = generate_hnsw_ace_layered_inputs();
 
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu
index 57b16d74e9..33a272a675 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu
+++ b/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu
@@ -23,4 +23,24 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest,
                         AnnHnswAceMemoryFallbackTest_float,
                         ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs));
 
+typedef AnnHnswAceTest<float, float, uint32_t> AnnHnswAceLayeredTest_float;
+TEST_P(AnnHnswAceLayeredTest_float, AnnHnswAceLayeredBuildDeserializeSearch)
+{
+  this->testHnswAceLayeredBuildDeserializeSearch();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest,
+                        AnnHnswAceLayeredTest_float,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
+typedef AnnHnswAceTest<float, float, uint32_t> AnnHnswAceMaterializeTest_float;
+TEST_P(AnnHnswAceMaterializeTest_float, AnnHnswAceLayeredMaterializeToHnswlib)
+{
+  this->testHnswAceLayeredMaterializeToHnswlib();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMaterializeTest,
+                        AnnHnswAceMaterializeTest_float,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu
index 38f75b2afa..acde405a3b 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu
+++ b/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu
@@ -12,4 +12,34 @@ TEST_P(AnnHnswAceTest_half, AnnHnswAceBuild) { this->testHnswAceBuild(); }
 
 INSTANTIATE_TEST_CASE_P(AnnHnswAceTest, AnnHnswAceTest_half, ::testing::ValuesIn(hnsw_ace_inputs));
 
+typedef AnnHnswAceTest<float, half, uint32_t> AnnHnswAceMemoryFallbackTest_half;
+TEST_P(AnnHnswAceMemoryFallbackTest_half, AnnHnswAceMemoryLimitFallback)
+{
+  this->testHnswAceMemoryLimitFallback();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest,
+                        AnnHnswAceMemoryFallbackTest_half,
+                        ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs));
+
+typedef AnnHnswAceTest<float, half, uint32_t> AnnHnswAceLayeredTest_half;
+TEST_P(AnnHnswAceLayeredTest_half, AnnHnswAceLayeredBuildDeserializeSearch)
+{
+  this->testHnswAceLayeredBuildDeserializeSearch();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest,
+                        AnnHnswAceLayeredTest_half,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
+typedef AnnHnswAceTest<float, half, uint32_t> AnnHnswAceMaterializeTest_half;
+TEST_P(AnnHnswAceMaterializeTest_half, AnnHnswAceLayeredMaterializeToHnswlib)
+{
+  this->testHnswAceLayeredMaterializeToHnswlib();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMaterializeTest,
+                        AnnHnswAceMaterializeTest_half,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu
index 279df6555f..a6fe986898 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu
+++ b/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu
@@ -14,4 +14,34 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceTest,
                         AnnHnswAceTest_int8_t,
                         ::testing::ValuesIn(hnsw_ace_inputs));
 
+typedef AnnHnswAceTest<float, int8_t, uint32_t> AnnHnswAceMemoryFallbackTest_int8_t;
+TEST_P(AnnHnswAceMemoryFallbackTest_int8_t, AnnHnswAceMemoryLimitFallback)
+{
+  this->testHnswAceMemoryLimitFallback();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest,
+                        AnnHnswAceMemoryFallbackTest_int8_t,
+                        ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs));
+
+typedef AnnHnswAceTest<float, int8_t, uint32_t> AnnHnswAceLayeredTest_int8_t;
+TEST_P(AnnHnswAceLayeredTest_int8_t, AnnHnswAceLayeredBuildDeserializeSearch)
+{
+  this->testHnswAceLayeredBuildDeserializeSearch();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest,
+                        AnnHnswAceLayeredTest_int8_t,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
+typedef AnnHnswAceTest<float, int8_t, uint32_t> AnnHnswAceMaterializeTest_int8_t;
+TEST_P(AnnHnswAceMaterializeTest_int8_t, AnnHnswAceLayeredMaterializeToHnswlib)
+{
+  this->testHnswAceLayeredMaterializeToHnswlib();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMaterializeTest,
+                        AnnHnswAceMaterializeTest_int8_t,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu
index 7e68dc4b17..13c553f260 100644
--- a/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu
+++ b/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu
@@ -14,4 +14,34 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceTest,
                         AnnHnswAceTest_uint8_t,
                         ::testing::ValuesIn(hnsw_ace_inputs));
 
+typedef AnnHnswAceTest<float, uint8_t, uint32_t> AnnHnswAceMemoryFallbackTest_uint8_t;
+TEST_P(AnnHnswAceMemoryFallbackTest_uint8_t, AnnHnswAceMemoryLimitFallback)
+{
+  this->testHnswAceMemoryLimitFallback();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest,
+                        AnnHnswAceMemoryFallbackTest_uint8_t,
+                        ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs));
+
+typedef AnnHnswAceTest<float, uint8_t, uint32_t> AnnHnswAceLayeredTest_uint8_t;
+TEST_P(AnnHnswAceLayeredTest_uint8_t, AnnHnswAceLayeredBuildDeserializeSearch)
+{
+  this->testHnswAceLayeredBuildDeserializeSearch();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest,
+                        AnnHnswAceLayeredTest_uint8_t,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
+typedef AnnHnswAceTest<float, uint8_t, uint32_t> AnnHnswAceMaterializeTest_uint8_t;
+TEST_P(AnnHnswAceMaterializeTest_uint8_t, AnnHnswAceLayeredMaterializeToHnswlib)
+{
+  this->testHnswAceLayeredMaterializeToHnswlib();
+}
+
+INSTANTIATE_TEST_CASE_P(AnnHnswAceMaterializeTest,
+                        AnnHnswAceMaterializeTest_uint8_t,
+                        ::testing::ValuesIn(hnsw_ace_layered_inputs));
+
 }  // namespace cuvs::neighbors::hnsw
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index d63ddbdb71..7a36217bae 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -36,6 +36,7 @@ add_executable(CAGRA_HNSW_ACE_EXAMPLE src/cagra_hnsw_ace_example.cu)
 add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu)
 add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu)
 add_executable(HNSW_ACE_EXAMPLE src/hnsw_ace_example.cu)
+add_executable(HNSW_ACE_LAYERED_EXAMPLE src/hnsw_ace_layered_example.cu)
 add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu)
 add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu)
 add_executable(VAMANA_EXAMPLE src/vamana_example.cu)
@@ -56,6 +57,7 @@ target_link_libraries(
   DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
 )
 target_link_libraries(HNSW_ACE_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
+target_link_libraries(HNSW_ACE_LAYERED_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
 target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
diff --git a/examples/cpp/src/hnsw_ace_layered_example.cu b/examples/cpp/src/hnsw_ace_layered_example.cu
new file mode 100644
index 0000000000..ced22ff3fe
--- /dev/null
+++ b/examples/cpp/src/hnsw_ace_layered_example.cu
@@ -0,0 +1,354 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Large graphs exceeding the memory capacity can be built using the Augmented Core Extraction (ACE)
+// algorithm, which partitions the dataset. The resulting HNSW index is too large to fit in memory
+// as well. Thus, the index needs to be transferred to a search server with enough memory.
+//
+// HNSWHierarchy::GPU_LAYERED_ON_DISK is a special hierarchy that builds a layered HNSW index on
+// disk. It emits one topology-only artifact, hnsw_index.cuvs. The dataset remains separate and does
+// not need to be transferred to the search server, which typically has the dataset locally.
+//
+// This example demonstrates how to build a layered HNSW index with ACE and turn it into a standard
+// hnswlib index for in-memory search:
+//
+// 1. Optionally quantize the dataset and queries to int8.
+// 2. Build a single-file layered HNSW artifact with ACE using hnsw::build.
+// 3. Materialize the layered artifact into a standard hnswlib index file on disk using
+//    hnsw::materialize_to_hnswlib (disk-to-disk, never holding the full index in host memory).
+// 4. Read the materialized hnswlib index into memory using hnsw::deserialize (hierarchy = CPU).
+// 5. Search the in-memory HNSW index.
+//
+// Layered-on-disk layout:
+//
+//   index_dir/hnsw_index.cuvs
+//     fixed header + metadata JSON
+//     levels: uint8 [N], max HNSW level for each original row id
+//     base nodes + base links: uint32 node ids with hnswlib-ready link rows
+//     upper nodes + upper links: hnswlib-ready upper-layer topology
+//
+// The transferred index artifact is topology-only. The dataset is loaded locally during
+// materialization from hnsw::materialize_params::dataset_path. The loader supports .npy and ANN
+// benchmark *.bin datasets; this example writes a local dataset .npy only to make the demo
+// self-contained. The materialized hnswlib index file is self-contained (it embeds the vectors),
+// so reading it back needs no dataset path.
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <filesystem>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/random/make_blobs.cuh>
+
+#include <cuvs/neighbors/hnsw.hpp>
+#include <cuvs/preprocessing/quantize/scalar.hpp>
+#include <cuvs/util/file_io.hpp>
+
+#include <rmm/mr/pool_memory_resource.hpp>
+
+#include "common.cuh"
+
+// When 1, scalar-quantize the float dataset to int8.
+#define HNSW_ACE_LAYERED_USE_QUANTIZATION 1
+
+namespace {
+
+constexpr const char* kBuildDir = "/tmp/hnsw_ace_layered";
+
+// Reports the wall-clock time of a callable in milliseconds.
+template <typename F>
+double time_ms(F&& fn)
+{
+  const auto start = std::chrono::steady_clock::now();
+  fn();
+  return std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start)
+    .count();
+}
+
+double to_gib(double bytes) { return bytes / (1024.0 * 1024.0 * 1024.0); }
+
+template <typename T>
+std::string write_local_dataset(raft::host_matrix_view<const T, int64_t> dataset,
+                                const std::string& path)
+{
+  auto [fd, header_size] = cuvs::util::create_numpy_file<T>(
+    path, {static_cast<size_t>(dataset.extent(0)), static_cast<size_t>(dataset.extent(1))});
+  cuvs::util::write_large_file(
+    fd, dataset.data_handle(), dataset.extent(0) * dataset.extent(1) * sizeof(T), header_size);
+  return path;
+}
+
+template <typename T>
+struct quantized_pair {
+  raft::host_matrix<T, int64_t> dataset;
+  raft::host_matrix<T, int64_t> queries;
+};
+
+quantized_pair<int8_t> quantize_dataset(raft::device_resources const& dev_resources,
+                                        raft::host_matrix_view<const float, int64_t> dataset_float,
+                                        raft::host_matrix_view<const float, int64_t> queries_float)
+{
+  std::cout << "  quantize_dataset: training scalar quantizer (float -> int8)" << std::endl;
+  cuvs::preprocessing::quantize::scalar::params qp;
+  auto quantizer = cuvs::preprocessing::quantize::scalar::train(dev_resources, qp, dataset_float);
+
+  auto dataset_i8 =
+    raft::make_host_matrix<int8_t, int64_t>(dataset_float.extent(0), dataset_float.extent(1));
+  cuvs::preprocessing::quantize::scalar::transform(
+    dev_resources, quantizer, dataset_float, dataset_i8.view());
+
+  auto queries_i8 =
+    raft::make_host_matrix<int8_t, int64_t>(queries_float.extent(0), queries_float.extent(1));
+  cuvs::preprocessing::quantize::scalar::transform(
+    dev_resources, quantizer, queries_float, queries_i8.view());
+
+  return {std::move(dataset_i8), std::move(queries_i8)};
+}
+
+auto make_hnsw_ace_params(const std::string& build_dir, const std::string& dataset_path)
+  -> cuvs::neighbors::hnsw::index_params
+{
+  using namespace cuvs::neighbors;
+
+  hnsw::index_params hnsw_params;
+  hnsw_params.metric          = cuvs::distance::DistanceType::L2Expanded;
+  hnsw_params.hierarchy       = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK;
+  hnsw_params.M               = 32;
+  hnsw_params.dataset_path    = dataset_path;  // Override this path on the search server.
+  hnsw_params.ef_construction = 120;
+
+  auto ace_params                = hnsw::graph_build_params::ace_params();
+  ace_params.npartitions         = 4;
+  ace_params.use_disk            = true;
+  ace_params.build_dir           = build_dir;
+  hnsw_params.graph_build_params = ace_params;
+
+  return hnsw_params;
+}
+
+template <typename T>
+auto hnsw_build(raft::device_resources const& dev_resources,
+                const cuvs::neighbors::hnsw::index_params& hnsw_params,
+                raft::host_matrix_view<const T, int64_t> dataset) -> std::string
+{
+  using namespace cuvs::neighbors;
+
+  std::unique_ptr<hnsw::index<T>> hnsw_index;
+  const auto build_ms =
+    time_ms([&]() { hnsw_index = hnsw::build(dev_resources, hnsw_params, dataset); });
+  const auto artifact_path = hnsw_index->file_path();
+  if (artifact_path.empty()) {
+    throw std::runtime_error("Expected layered HNSW build to return an artifact path.");
+  }
+  const auto artifact_bytes = static_cast<double>(std::filesystem::file_size(artifact_path));
+  std::cout << "  hnsw_build: layered artifact written to " << artifact_path << "\n"
+            << "  hnsw_build: build wall time " << build_ms << " ms, artifact "
+            << to_gib(artifact_bytes) << " GiB" << std::endl;
+  return artifact_path;
+}
+
+// Materialize the layered artifact into a standard hnswlib index file on disk and time the
+// disk-to-disk materialization. Returns the path to the native hnswlib index file.
+template <typename T>
+auto hnsw_materialize(raft::device_resources const& dev_resources,
+                      const cuvs::neighbors::hnsw::index_params& hnsw_params,
+                      const std::string& artifact_path,
+                      const std::string& dataset_path,
+                      int64_t dim,
+                      const std::string& output_path) -> std::string
+{
+  using namespace cuvs::neighbors;
+
+  hnsw::materialize_params materialize_params;
+  materialize_params.dataset_path       = dataset_path;
+  materialize_params.max_host_memory_gb = 0;  // 0 => single in-memory reorder pass
+  materialize_params.num_threads        = 0;  // 0 => max threads
+
+  const auto materialize_ms = time_ms([&]() {
+    hnsw::materialize_to_hnswlib(dev_resources,
+                                 materialize_params,
+                                 artifact_path,
+                                 output_path,
+                                 static_cast<int>(dim),
+                                 hnsw_params.metric);
+  });
+
+  const auto native_bytes = static_cast<double>(std::filesystem::file_size(output_path));
+  std::cout << "  hnsw_materialize: native hnswlib index written to " << output_path << "\n"
+            << "  hnsw_materialize: wall time " << materialize_ms << " ms, output "
+            << to_gib(native_bytes) << " GiB" << std::endl;
+  return output_path;
+}
+
+// Read the materialized hnswlib index into memory for search. The materialized file is a standard
+// hnswlib index, so it is loaded with hierarchy == CPU and needs no dataset path (the file already
+// embeds the vectors).
+template <typename T>
+auto hnsw_load_native(raft::device_resources const& dev_resources,
+                      const std::string& native_index_path,
+                      cuvs::distance::DistanceType metric,
+                      int64_t dim) -> std::unique_ptr<cuvs::neighbors::hnsw::index<T>>
+{
+  using namespace cuvs::neighbors;
+
+  hnsw::index_params load_params;
+  load_params.hierarchy = hnsw::HnswHierarchy::CPU;
+  load_params.metric    = metric;
+
+  hnsw::index<T>* loaded_index = nullptr;
+  hnsw::deserialize(
+    dev_resources, load_params, native_index_path, static_cast<int>(dim), metric, &loaded_index);
+  return std::unique_ptr<hnsw::index<T>>(loaded_index);
+}
+
+template <typename T>
+void hnsw_search(raft::device_resources const& dev_resources,
+                 const cuvs::neighbors::hnsw::index<T>& hnsw_index,
+                 raft::host_matrix_view<const T, int64_t> queries,
+                 int64_t topk = 12)
+{
+  using namespace cuvs::neighbors;
+
+  const int64_t n_queries  = queries.extent(0);
+  auto indices_hnsw_host   = raft::make_host_matrix<uint64_t, int64_t>(n_queries, topk);
+  auto distances_hnsw_host = raft::make_host_matrix<float, int64_t>(n_queries, topk);
+
+  hnsw::search_params search_params;
+  search_params.ef          = std::max(200, static_cast<int>(topk) * 2);
+  search_params.num_threads = 1;
+
+  hnsw::search(dev_resources,
+               search_params,
+               hnsw_index,
+               queries,
+               indices_hnsw_host.view(),
+               distances_hnsw_host.view());
+
+  auto neighbors      = raft::make_device_matrix<uint32_t>(dev_resources, n_queries, topk);
+  auto distances      = raft::make_device_matrix<float>(dev_resources, n_queries, topk);
+  auto neighbors_host = raft::make_host_matrix<uint32_t, int64_t>(n_queries, topk);
+  for (int64_t i = 0; i < n_queries; ++i) {
+    for (int64_t j = 0; j < topk; ++j) {
+      neighbors_host(i, j) = static_cast<uint32_t>(indices_hnsw_host(i, j));
+    }
+  }
+
+  raft::copy(neighbors.data_handle(),
+             neighbors_host.data_handle(),
+             n_queries * topk,
+             raft::resource::get_cuda_stream(dev_resources));
+  raft::copy(distances.data_handle(),
+             distances_hnsw_host.data_handle(),
+             n_queries * topk,
+             raft::resource::get_cuda_stream(dev_resources));
+  raft::resource::sync_stream(dev_resources);
+
+  print_results(dev_resources, neighbors.view(), distances.view());
+}
+
+}  // namespace
+
+int main()
+{
+  raft::device_resources dev_resources;
+
+  // Surface the per-phase build/materialize timing logs (RAFT_LOG_INFO).
+  raft::default_logger().set_level(rapids_logger::level_enum::info);
+
+  rmm::mr::pool_memory_resource pool_mr(rmm::mr::get_current_device_resource_ref(),
+                                        1024 * 1024 * 1024ull);
+  rmm::mr::set_current_device_resource(pool_mr);
+
+#if HNSW_ACE_LAYERED_USE_QUANTIZATION
+  std::cout << "[stage 1] Generate and quantize dataset (float -> int8)" << std::endl;
+#else
+  std::cout << "[stage 1] Generate dataset (float)" << std::endl;
+#endif
+
+  int64_t n_samples = 10000;
+  int64_t n_dim     = 90;
+  int64_t n_queries = 10;
+  auto dataset      = raft::make_device_matrix<float, int64_t>(dev_resources, n_samples, n_dim);
+  auto queries      = raft::make_device_matrix<float, int64_t>(dev_resources, n_queries, n_dim);
+  generate_dataset(dev_resources, dataset.view(), queries.view());
+
+  auto dataset_host = raft::make_host_matrix<float, int64_t>(n_samples, n_dim);
+  auto queries_host = raft::make_host_matrix<float, int64_t>(n_queries, n_dim);
+  raft::copy(dataset_host.data_handle(),
+             dataset.data_handle(),
+             dataset.extent(0) * dataset.extent(1),
+             raft::resource::get_cuda_stream(dev_resources));
+  raft::copy(queries_host.data_handle(),
+             queries.data_handle(),
+             queries.extent(0) * queries.extent(1),
+             raft::resource::get_cuda_stream(dev_resources));
+  raft::resource::sync_stream(dev_resources);
+
+  auto dataset_host_view = raft::make_host_matrix_view<const float, int64_t, raft::row_major>(
+    dataset_host.data_handle(), n_samples, n_dim);
+  auto queries_host_view = raft::make_host_matrix_view<const float, int64_t, raft::row_major>(
+    queries_host.data_handle(), n_queries, n_dim);
+
+  std::filesystem::create_directories(kBuildDir);
+
+#if HNSW_ACE_LAYERED_USE_QUANTIZATION
+  auto q               = quantize_dataset(dev_resources, dataset_host_view, queries_host_view);
+  auto dataset_i8_view = raft::make_host_matrix_view<const int8_t, int64_t, raft::row_major>(
+    q.dataset.data_handle(), n_samples, n_dim);
+  auto queries_i8_view = raft::make_host_matrix_view<const int8_t, int64_t, raft::row_major>(
+    q.queries.data_handle(), n_queries, n_dim);
+  auto dataset_path = write_local_dataset(dataset_i8_view, std::string{kBuildDir} + "/dataset.npy");
+  auto hnsw_params  = make_hnsw_ace_params(kBuildDir, dataset_path);
+
+  const std::string native_index_path = std::string{kBuildDir} + "/hnsw_native.bin";
+
+  std::cout << "[stage 2] Build layered HNSW index with ACE" << std::endl;
+  auto artifact_path = hnsw_build<int8_t>(dev_resources, hnsw_params, dataset_i8_view);
+
+  std::cout << "[stage 3] Materialize layered HNSW -> native hnswlib index" << std::endl;
+  hnsw_materialize<int8_t>(
+    dev_resources, hnsw_params, artifact_path, dataset_path, n_dim, native_index_path);
+
+  std::cout << "[stage 4] Read materialized hnswlib index into memory" << std::endl;
+  auto hnsw_index =
+    hnsw_load_native<int8_t>(dev_resources, native_index_path, hnsw_params.metric, n_dim);
+
+  std::cout << "[stage 5] Search HNSW index" << std::endl;
+  hnsw_search<int8_t>(dev_resources, *hnsw_index, queries_i8_view);
+#else
+  auto dataset_path =
+    write_local_dataset(dataset_host_view, std::string{kBuildDir} + "/dataset.npy");
+  auto hnsw_params = make_hnsw_ace_params(kBuildDir, dataset_path);
+
+  const std::string native_index_path = std::string{kBuildDir} + "/hnsw_native.bin";
+
+  std::cout << "[stage 2] Build layered HNSW index with ACE" << std::endl;
+  auto artifact_path = hnsw_build<float>(dev_resources, hnsw_params, dataset_host_view);
+
+  std::cout << "[stage 3] Materialize layered HNSW -> native hnswlib index" << std::endl;
+  hnsw_materialize<float>(
+    dev_resources, hnsw_params, artifact_path, dataset_path, n_dim, native_index_path);
+
+  std::cout << "[stage 4] Read materialized hnswlib index into memory" << std::endl;
+  auto hnsw_index =
+    hnsw_load_native<float>(dev_resources, native_index_path, hnsw_params.metric, n_dim);
+
+  std::cout << "[stage 5] Search HNSW index" << std::endl;
+  hnsw_search<float>(dev_resources, *hnsw_index, queries_host_view);
+#endif
+
+  return 0;
+}
diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java
index 3eef491b62..14d897521e 100644
--- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java
+++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java
@@ -82,6 +82,44 @@ static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParams, CuVS
     return CuVSProvider.provider().hnswIndexBuild(resources, hnswParams, dataset);
   }
 
+  /**
+   * Materializes a layered HNSW artifact into a standard hnswlib index file on
+   * disk.
+   *
+   * Materializes a {@code GPU_LAYERED_ON_DISK} artifact (graph topology only,
+   * stored in ACE order) plus a local dataset into a standard hnswlib index file,
+   * without ever holding the full materialized index in host memory. The
+   * resulting file is compatible with the original hnswlib library and can be read
+   * back with {@code hierarchy == CPU}. The element data type is read from the
+   * artifact header.
+   *
+   * @param resources The CuVS resources
+   * @param materializeParams Materialization parameters (dataset path, host-memory
+   *     budget, threads)
+   * @param layeredArtifactPath Path to the layered HNSW artifact
+   * @param outputPath Path to the hnswlib index file to write
+   * @param dim The dimension of the vectors in the index
+   * @param metric The distance metric used to build the index
+   * @throws Throwable if an error occurs during materialization
+   */
+  static void materializeToHnswlib(
+      CuVSResources resources,
+      HnswMaterializeParams materializeParams,
+      String layeredArtifactPath,
+      String outputPath,
+      int dim,
+      HnswIndexParams.CuvsDistanceType metric)
+      throws Throwable {
+    Objects.requireNonNull(resources);
+    Objects.requireNonNull(materializeParams);
+    Objects.requireNonNull(layeredArtifactPath);
+    Objects.requireNonNull(outputPath);
+    Objects.requireNonNull(metric);
+    CuVSProvider.provider()
+        .hnswMaterializeToHnswlib(
+            resources, materializeParams, layeredArtifactPath, outputPath, dim, metric);
+  }
+
   /**
    * Builder helps configure and create an instance of {@link HnswIndex}.
    */
diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java
index 070cbedae1..6de60ed4f1 100644
--- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java
+++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java
@@ -46,7 +46,16 @@ public enum CuvsHnswHierarchy {
     /**
      * Full hierarchy is built using the GPU
      */
-    GPU(2);
+    GPU(2),
+
+    /**
+     * GPU-built hierarchy stored as a layered on-disk topology artifact.
+     *
+     * The artifact stores graph topology only. When loading such an artifact, the
+     * {@code datasetPath} must point to the original-ID-ordered vectors used to
+     * reconstruct an in-memory HNSW index.
+     */
+    GPU_LAYERED_ON_DISK(3);
 
     /**
      * The value for the enum choice.
@@ -65,6 +74,8 @@ public static class HnswHierarchy {
     public static final CuvsHnswHierarchy NONE = CuvsHnswHierarchy.NONE;
     public static final CuvsHnswHierarchy CPU = CuvsHnswHierarchy.CPU;
     public static final CuvsHnswHierarchy GPU = CuvsHnswHierarchy.GPU;
+    public static final CuvsHnswHierarchy GPU_LAYERED_ON_DISK =
+        CuvsHnswHierarchy.GPU_LAYERED_ON_DISK;
   }
 
   private CuvsHnswHierarchy hierarchy = CuvsHnswHierarchy.GPU;
@@ -74,6 +85,7 @@ public static class HnswHierarchy {
   private long m = 32;
   private CuvsDistanceType metric = CuvsDistanceType.L2Expanded;
   private HnswAceParams aceParams;
+  private String datasetPath;
 
   private HnswIndexParams(
       CuvsHnswHierarchy hierarchy,
@@ -82,7 +94,8 @@ private HnswIndexParams(
       int vectorDimension,
       long m,
       CuvsDistanceType metric,
-      HnswAceParams aceParams) {
+      HnswAceParams aceParams,
+      String datasetPath) {
     this.hierarchy = hierarchy;
     this.efConstruction = efConstruction;
     this.numThreads = numThreads;
@@ -90,6 +103,7 @@ private HnswIndexParams(
     this.m = m;
     this.metric = metric;
     this.aceParams = aceParams;
+    this.datasetPath = datasetPath;
   }
 
   /**
@@ -153,6 +167,20 @@ public HnswAceParams getAceParams() {
     return aceParams;
   }
 
+  /**
+   * Gets the local dataset path used by layered HNSW deserialization.
+   *
+   * Required when {@code hierarchy == GPU_LAYERED_ON_DISK}: the artifact stores
+   * graph topology only, and loading reads the original-ID-ordered vectors from
+   * this path to reconstruct an in-memory HNSW index. Ignored for all other
+   * hierarchies.
+   *
+   * @return the dataset path, or null if not set
+   */
+  public String getDatasetPath() {
+    return datasetPath;
+  }
+
   @Override
   public String toString() {
     return "HnswIndexParams [hierarchy="
@@ -169,6 +197,8 @@ public String toString() {
         + metric
         + ", aceParams="
         + aceParams
+        + ", datasetPath="
+        + datasetPath
         + "]";
   }
 
@@ -184,6 +214,7 @@ public static class Builder {
     private long m = 32;
     private CuvsDistanceType metric = CuvsDistanceType.L2Expanded;
     private HnswAceParams aceParams;
+    private String datasetPath;
 
     /**
      * Constructs this Builder with an instance of Arena.
@@ -276,6 +307,22 @@ public Builder withAceParams(HnswAceParams aceParams) {
       return this;
     }
 
+    /**
+     * Sets the local dataset path used by layered HNSW deserialization.
+     *
+     * Required when {@code hierarchy == GPU_LAYERED_ON_DISK}: the artifact stores
+     * graph topology only, and loading reads the original-ID-ordered vectors from
+     * this path to reconstruct an in-memory HNSW index. Ignored for all other
+     * hierarchies.
+     *
+     * @param datasetPath the local dataset path
+     * @return an instance of Builder
+     */
+    public Builder withDatasetPath(String datasetPath) {
+      this.datasetPath = datasetPath;
+      return this;
+    }
+
     /**
      * Builds an instance of {@link HnswIndexParams}.
      *
@@ -289,7 +336,8 @@ public HnswIndexParams build() {
           vectorDimension,
           m,
           metric,
-          aceParams);
+          aceParams,
+          datasetPath);
     }
   }
 }
diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswMaterializeParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswMaterializeParams.java
new file mode 100644
index 0000000000..efbcc8acc1
--- /dev/null
+++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswMaterializeParams.java
@@ -0,0 +1,134 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package com.nvidia.cuvs;
+
+/**
+ * Parameters for materializing a layered HNSW artifact into a standard hnswlib
+ * index file on disk.
+ *
+ * @since 26.08
+ */
+public class HnswMaterializeParams {
+
+  private final String datasetPath;
+  private final double maxHostMemoryGb;
+  private final int numThreads;
+
+  private HnswMaterializeParams(String datasetPath, double maxHostMemoryGb, int numThreads) {
+    this.datasetPath = datasetPath;
+    this.maxHostMemoryGb = maxHostMemoryGb;
+    this.numThreads = numThreads;
+  }
+
+  /**
+   * Gets the local dataset path holding the original-ID-ordered vectors used to
+   * build the artifact.
+   *
+   * @return the dataset path, or null if not set
+   */
+  public String getDatasetPath() {
+    return datasetPath;
+  }
+
+  /**
+   * Gets the upper bound on host memory (in GiB) used for the base-topology
+   * reorder buffer. When {@code <= 0}, the whole base topology is reordered in a
+   * single in-memory pass.
+   *
+   * @return the max host memory in GiB (0 means a single in-memory pass)
+   */
+  public double getMaxHostMemoryGb() {
+    return maxHostMemoryGb;
+  }
+
+  /**
+   * Gets the number of host threads to use. When 0, the maximum number of
+   * threads is used.
+   *
+   * @return the number of threads
+   */
+  public int getNumThreads() {
+    return numThreads;
+  }
+
+  @Override
+  public String toString() {
+    return "HnswMaterializeParams [datasetPath="
+        + datasetPath
+        + ", maxHostMemoryGb="
+        + maxHostMemoryGb
+        + ", numThreads="
+        + numThreads
+        + "]";
+  }
+
+  /**
+   * Builder configures and creates an instance of {@link HnswMaterializeParams}.
+   */
+  public static class Builder {
+
+    private String datasetPath;
+    private double maxHostMemoryGb = 0;
+    private int numThreads = 0;
+
+    /**
+     * Constructs this Builder.
+     */
+    public Builder() {}
+
+    /**
+     * Sets the local dataset path holding the original-ID-ordered vectors used to
+     * build the artifact. Supported formats match layered deserialization:
+     * {@code .npy} and ANN benchmark {@code *.bin} files with a
+     * {@code [uint32 rows, uint32 cols]} header ({@code .fbin}, {@code .f16bin},
+     * {@code .u8bin}, {@code .i8bin}).
+     *
+     * @param datasetPath the local dataset path
+     * @return an instance of Builder
+     */
+    public Builder withDatasetPath(String datasetPath) {
+      this.datasetPath = datasetPath;
+      return this;
+    }
+
+    /**
+     * Sets the upper bound on host memory (in GiB) used for the base-topology
+     * reorder buffer.
+     *
+     * When {@code <= 0} (default), the whole base topology is reordered in a single
+     * in-memory pass (no temporary files). When set, the base topology is reordered
+     * through bucketed temporary files so that peak host memory stays close to this
+     * budget.
+     *
+     * @param maxHostMemoryGb the max host memory in GiB
+     * @return an instance of Builder
+     */
+    public Builder withMaxHostMemoryGb(double maxHostMemoryGb) {
+      this.maxHostMemoryGb = maxHostMemoryGb;
+      return this;
+    }
+
+    /**
+     * Sets the number of host threads to use. When 0 (default), the maximum number
+     * of threads is used.
+     *
+     * @param numThreads the number of threads
+     * @return an instance of Builder
+     */
+    public Builder withNumThreads(int numThreads) {
+      this.numThreads = numThreads;
+      return this;
+    }
+
+    /**
+     * Builds an instance of {@link HnswMaterializeParams}.
+     *
+     * @return an instance of {@link HnswMaterializeParams}
+     */
+    public HnswMaterializeParams build() {
+      return new HnswMaterializeParams(datasetPath, maxHostMemoryGb, numThreads);
+    }
+  }
+}
diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java
index c39578755c..9ba6a34475 100644
--- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java
+++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java
@@ -135,6 +135,28 @@ HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources)
   HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset)
       throws Throwable;
 
+  /**
+   * Materializes a layered HNSW artifact into a standard hnswlib index file on disk.
+   *
+   * @param resources The CuVS resources
+   * @param materializeParams Materialization parameters (dataset path, host-memory budget, threads)
+   * @param layeredArtifactPath Path to the layered HNSW artifact
+   * @param outputPath Path to the hnswlib index file to write
+   * @param dim The dimension of the vectors in the index
+   * @param metric The distance metric used to build the index
+   * @throws Throwable if an error occurs during materialization
+   */
+  default void hnswMaterializeToHnswlib(
+      CuVSResources resources,
+      HnswMaterializeParams materializeParams,
+      String layeredArtifactPath,
+      String outputPath,
+      int dim,
+      HnswIndexParams.CuvsDistanceType metric)
+      throws Throwable {
+    throw new UnsupportedOperationException();
+  }
+
   /** Creates a new TieredIndex Builder. */
   TieredIndex.Builder newTieredIndexBuilder(CuVSResources cuVSResources)
       throws UnsupportedOperationException;
diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java
index 950504bc5a..573699a39d 100644
--- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java
+++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java
@@ -160,6 +160,27 @@ public void close() {
     }
   }
 
+  static CloseableHandle createHnswMaterializeParamsNative() {
+    try (var localArena = Arena.ofConfined()) {
+      var paramsPtrPtr = localArena.allocate(cuvsHnswMaterializeParams_t);
+      checkCuVSError(
+          cuvsHnswMaterializeParamsCreate(paramsPtrPtr), "cuvsHnswMaterializeParamsCreate");
+      var paramsPtr = paramsPtrPtr.get(cuvsHnswMaterializeParams_t, 0L);
+      return new CloseableHandle() {
+        @Override
+        public MemorySegment handle() {
+          return paramsPtr;
+        }
+
+        @Override
+        public void close() {
+          checkCuVSError(
+              cuvsHnswMaterializeParamsDestroy(paramsPtr), "cuvsHnswMaterializeParamsDestroy");
+        }
+      };
+    }
+  }
+
   static CloseableHandle createTieredIndexParams() {
     try (var localArena = Arena.ofConfined()) {
       var paramsPtrPtr = localArena.allocate(cuvsTieredIndexParams_t);
diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java
index ca528ac010..c6e241180b 100644
--- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java
+++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java
@@ -6,6 +6,7 @@
 
 import static com.nvidia.cuvs.internal.CuVSParamsHelper.createHnswAceParamsNative;
 import static com.nvidia.cuvs.internal.CuVSParamsHelper.createHnswIndexParams;
+import static com.nvidia.cuvs.internal.CuVSParamsHelper.createHnswMaterializeParamsNative;
 import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT;
 import static com.nvidia.cuvs.internal.common.LinkerHelper.C_LONG;
 import static com.nvidia.cuvs.internal.common.Util.buildMemorySegment;
@@ -19,6 +20,7 @@
 import com.nvidia.cuvs.HnswAceParams;
 import com.nvidia.cuvs.HnswIndex;
 import com.nvidia.cuvs.HnswIndexParams;
+import com.nvidia.cuvs.HnswMaterializeParams;
 import com.nvidia.cuvs.HnswQuery;
 import com.nvidia.cuvs.HnswSearchParams;
 import com.nvidia.cuvs.SearchResults;
@@ -27,6 +29,7 @@
 import com.nvidia.cuvs.internal.panama.cuvsHnswAceParams;
 import com.nvidia.cuvs.internal.panama.cuvsHnswIndex;
 import com.nvidia.cuvs.internal.panama.cuvsHnswIndexParams;
+import com.nvidia.cuvs.internal.panama.cuvsHnswMaterializeParams;
 import com.nvidia.cuvs.internal.panama.cuvsHnswSearchParams;
 import java.io.InputStream;
 import java.lang.foreign.Arena;
@@ -197,7 +200,14 @@ private IndexReference deserialize(InputStream inputStream) throws Throwable {
 
       cuvsHnswIndex.dtype(indexReference.memorySegment, dtype);
 
-      try (var params = segmentFromIndexParams(hnswIndexParams);
+      // The metric only affects loading for the layered (GPU_LAYERED_ON_DISK) hierarchy, where the
+      // native side validates it against the artifact header. For the CPU/GPU/NONE hnswlib paths
+      // the metric is otherwise unused at load time.
+      int deserializeMetric =
+          hnswIndexParams.getHierarchy() == HnswIndexParams.CuvsHnswHierarchy.GPU_LAYERED_ON_DISK
+              ? hnswIndexParams.getMetric().value
+              : 0;
+      try (var params = segmentFromIndexParams(localArena, hnswIndexParams);
           var cuvsResourcesAccessor = resources.access()) {
         checkCuVSError(
             cuvsHnswDeserialize(
@@ -205,7 +215,7 @@ private IndexReference deserialize(InputStream inputStream) throws Throwable {
                 params.handle(),
                 pathSeg,
                 hnswIndexParams.getVectorDimension(),
-                0,
+                deserializeMetric,
                 indexReference.memorySegment),
             "cuvsHnswDeserialize");
       }
@@ -218,13 +228,21 @@ private IndexReference deserialize(InputStream inputStream) throws Throwable {
   }
 
   /**
-   * Allocates the configured search parameters in the MemorySegment.
+   * Allocates the configured index parameters in the MemorySegment.
+   *
+   * The {@code dataset_path} string (when set) is allocated from {@code arena}, so
+   * the caller must keep {@code arena} alive until the native call that consumes
+   * the params has returned.
    */
-  private CloseableHandle segmentFromIndexParams(HnswIndexParams params) {
+  private CloseableHandle segmentFromIndexParams(Arena arena, HnswIndexParams params) {
     var hnswParams = createHnswIndexParams();
     cuvsHnswIndexParams.hierarchy(hnswParams.handle(), params.getHierarchy().value);
     cuvsHnswIndexParams.ef_construction(hnswParams.handle(), params.getEfConstruction());
     cuvsHnswIndexParams.num_threads(hnswParams.handle(), params.getNumThreads());
+    String datasetPath = params.getDatasetPath();
+    if (datasetPath != null) {
+      cuvsHnswIndexParams.dataset_path(hnswParams.handle(), arena.allocateFrom(datasetPath));
+    }
     return hnswParams;
   }
 
@@ -288,6 +306,56 @@ public static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParam
     return new HnswIndexImpl(new IndexReference(hnswIndex), resources, hnswParams);
   }
 
+  /**
+   * Materializes a layered HNSW artifact into a standard hnswlib index file on disk.
+   *
+   * @param resources            the CuVS resources
+   * @param materializeParams    the materialization parameters
+   * @param layeredArtifactPath  path to the layered HNSW artifact
+   * @param outputPath           path to the hnswlib index file to write
+   * @param dim                  the dimension of the vectors in the index
+   * @param metric               the distance metric used to build the index
+   * @throws Throwable if an error occurs during materialization
+   */
+  public static void materializeToHnswlib(
+      CuVSResources resources,
+      HnswMaterializeParams materializeParams,
+      String layeredArtifactPath,
+      String outputPath,
+      int dim,
+      HnswIndexParams.CuvsDistanceType metric)
+      throws Throwable {
+    Objects.requireNonNull(resources);
+    Objects.requireNonNull(materializeParams);
+    Objects.requireNonNull(layeredArtifactPath);
+    Objects.requireNonNull(outputPath);
+    Objects.requireNonNull(metric);
+
+    try (var localArena = Arena.ofConfined();
+        var paramsHandle = createHnswMaterializeParamsNative()) {
+      MemorySegment paramsSeg = paramsHandle.handle();
+
+      String datasetPath = materializeParams.getDatasetPath();
+      if (datasetPath != null) {
+        cuvsHnswMaterializeParams.dataset_path(paramsSeg, localArena.allocateFrom(datasetPath));
+      }
+      cuvsHnswMaterializeParams.max_host_memory_gb(
+          paramsSeg, materializeParams.getMaxHostMemoryGb());
+      cuvsHnswMaterializeParams.num_threads(paramsSeg, materializeParams.getNumThreads());
+
+      MemorySegment artifactSeg = buildMemorySegment(localArena, layeredArtifactPath);
+      MemorySegment outputSeg = buildMemorySegment(localArena, outputPath);
+
+      try (var resourcesAccessor = resources.access()) {
+        var cuvsRes = resourcesAccessor.handle();
+        int returnValue =
+            cuvsHnswMaterializeToHnswlib(
+                cuvsRes, paramsSeg, artifactSeg, outputSeg, dim, metric.value);
+        checkCuVSError(returnValue, "cuvsHnswMaterializeToHnswlib");
+      }
+    }
+  }
+
   private static CloseableHandle createHnswIndexParamsForBuild(Arena arena, HnswIndexParams params) {
     var hnswParams = createHnswIndexParams();
     MemorySegment seg = hnswParams.handle();
diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java
index 1d3199f26f..b7b03c8ce5 100644
--- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java
+++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java
@@ -260,6 +260,19 @@ public HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswPar
     return HnswIndexImpl.build(resources, hnswParams, dataset);
   }
 
+  @Override
+  public void hnswMaterializeToHnswlib(
+      CuVSResources resources,
+      HnswMaterializeParams materializeParams,
+      String layeredArtifactPath,
+      String outputPath,
+      int dim,
+      HnswIndexParams.CuvsDistanceType metric)
+      throws Throwable {
+    HnswIndexImpl.materializeToHnswlib(
+        resources, materializeParams, layeredArtifactPath, outputPath, dim, metric);
+  }
+
   @Override
   public TieredIndex.Builder newTieredIndexBuilder(CuVSResources cuVSResources) {
     return TieredIndexImpl.newBuilder(Objects.requireNonNull(cuVSResources));
diff --git a/python/cuvs/cuvs/neighbors/hnsw/__init__.py b/python/cuvs/cuvs/neighbors/hnsw/__init__.py
index f91835b7c5..3e8541b603 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/__init__.py
+++ b/python/cuvs/cuvs/neighbors/hnsw/__init__.py
@@ -7,11 +7,13 @@
     ExtendParams,
     Index,
     IndexParams,
+    MaterializeParams,
     SearchParams,
     build,
     extend,
     from_cagra,
     load,
+    materialize_to_hnswlib,
     save,
     search,
 )
@@ -21,10 +23,12 @@
     "IndexParams",
     "Index",
     "ExtendParams",
+    "MaterializeParams",
     "build",
     "extend",
     "SearchParams",
     "load",
+    "materialize_to_hnswlib",
     "save",
     "search",
     "from_cagra",
diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd
index 9ffb295ad3..1dff7a0809 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd
+++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd
@@ -20,6 +20,7 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil:
         NONE
         CPU
         GPU
+        GPU_LAYERED_ON_DISK
 
     ctypedef struct cuvsHnswAceParams:
         size_t npartitions
@@ -41,6 +42,7 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil:
         size_t M
         cuvsDistanceType metric
         cuvsHnswAceParams_t ace_params
+        const char* dataset_path
 
     ctypedef cuvsHnswIndexParams* cuvsHnswIndexParams_t
 
@@ -105,3 +107,24 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil:
                                     int32_t dim,
                                     cuvsDistanceType metric,
                                     cuvsHnswIndex_t index) except +
+
+    ctypedef struct cuvsHnswMaterializeParams:
+        const char* dataset_path
+        double max_host_memory_gb
+        int num_threads
+
+    ctypedef cuvsHnswMaterializeParams* cuvsHnswMaterializeParams_t
+
+    cuvsError_t cuvsHnswMaterializeParamsCreate(
+        cuvsHnswMaterializeParams_t* params)
+
+    cuvsError_t cuvsHnswMaterializeParamsDestroy(
+        cuvsHnswMaterializeParams_t params)
+
+    cuvsError_t cuvsHnswMaterializeToHnswlib(
+        cuvsResources_t res,
+        cuvsHnswMaterializeParams_t params,
+        const char* layered_artifact_path,
+        const char* output_path,
+        int32_t dim,
+        cuvsDistanceType metric) except +
diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
index 6757695de3..bff11001d0 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
+++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx
@@ -124,10 +124,15 @@ cdef class IndexParams:
     Parameters
     ----------
     hierarchy : string, default = "gpu" (optional)
-        The hierarchy of the HNSW index. Valid values are ["none", "cpu", "gpu"].
+        The hierarchy of the HNSW index. Valid values are
+        ["none", "cpu", "gpu", "gpu_layered_on_disk"].
         - "none": No hierarchy is built.
         - "cpu": Hierarchy is built using CPU.
         - "gpu": Hierarchy is built using GPU.
+        - "gpu_layered_on_disk": The index artifact stores graph topology only
+          (built on the GPU and stored as a layered on-disk artifact). When
+          loading such an artifact with `load()`, `dataset_path` must point to
+          the original-ID-ordered vectors used to reconstruct the index.
     ef_construction : int, default = 200 (optional)
         Maximum number of candidate list size used during construction
         when hierarchy is `cpu`.
@@ -148,14 +153,22 @@ cdef class IndexParams:
     ace_params : AceParams, default = None (optional)
         ACE parameters for building HNSW index using ACE algorithm. If set,
         enables the build() function to use ACE for index construction.
+    dataset_path : string, default = None (optional)
+        Local dataset path used by layered HNSW deserialization. Required when
+        `hierarchy == "gpu_layered_on_disk"`: the artifact stores graph
+        topology only, and `load()` reads the original-ID-ordered vectors from
+        this path to reconstruct an in-memory HNSW index. Ignored for all other
+        hierarchies.
     """
 
     cdef cuvsHnswIndexParams* params
     cdef AceParams _ace_params
+    cdef object _dataset_path_bytes
 
     def __cinit__(self):
         check_cuvs(cuvsHnswIndexParamsCreate(&self.params))
         self._ace_params = None
+        self._dataset_path_bytes = None
 
     def __dealloc__(self):
         check_cuvs(cuvsHnswIndexParamsDestroy(self.params))
@@ -166,16 +179,20 @@ cdef class IndexParams:
                  num_threads=0,
                  M=32,
                  metric="sqeuclidean",
-                 ace_params=None):
+                 ace_params=None,
+                 dataset_path=None):
         if hierarchy == "none":
             self.params.hierarchy = cuvsHnswHierarchy.NONE
         elif hierarchy == "cpu":
             self.params.hierarchy = cuvsHnswHierarchy.CPU
         elif hierarchy == "gpu":
             self.params.hierarchy = cuvsHnswHierarchy.GPU
+        elif hierarchy == "gpu_layered_on_disk":
+            self.params.hierarchy = cuvsHnswHierarchy.GPU_LAYERED_ON_DISK
         else:
             raise ValueError("Invalid hierarchy type."
-                             " Valid values are 'none', 'cpu', and 'gpu'.")
+                             " Valid values are 'none', 'cpu', 'gpu', and"
+                             " 'gpu_layered_on_disk'.")
         self.params.ef_construction = ef_construction
         self.params.num_threads = num_threads
         self.params.M = M
@@ -189,6 +206,12 @@ cdef class IndexParams:
         else:
             self.params.ace_params = NULL
 
+        if dataset_path is not None:
+            self._dataset_path_bytes = dataset_path.encode('utf-8')
+            self.params.dataset_path = self._dataset_path_bytes
+        else:
+            self.params.dataset_path = NULL
+
     @property
     def hierarchy(self):
         if self.params.hierarchy == cuvsHnswHierarchy.NONE:
@@ -197,6 +220,8 @@ cdef class IndexParams:
             return "cpu"
         elif self.params.hierarchy == cuvsHnswHierarchy.GPU:
             return "gpu"
+        elif self.params.hierarchy == cuvsHnswHierarchy.GPU_LAYERED_ON_DISK:
+            return "gpu_layered_on_disk"
 
     @property
     def ef_construction(self):
@@ -214,6 +239,12 @@ cdef class IndexParams:
     def ace_params(self):
         return self._ace_params
 
+    @property
+    def dataset_path(self):
+        if self.params.dataset_path is not NULL:
+            return self.params.dataset_path.decode('utf-8')
+        return None
+
 
 cdef class Index:
     """
@@ -270,6 +301,68 @@ cdef class ExtendParams:
         return self.params.num_threads
 
 
+cdef class MaterializeParams:
+    """
+    Parameters for materializing a layered HNSW artifact into an hnswlib
+    index on disk.
+
+    Parameters
+    ----------
+    dataset_path : string, default = None (optional)
+        Local dataset path holding the original-ID-ordered vectors used to
+        build the artifact. Supported formats match layered deserialization:
+        `.npy` and ANN benchmark `*.bin` files with a
+        `[uint32 rows, uint32 cols]` header (`.fbin`, `.f16bin`, `.u8bin`,
+        `.i8bin`).
+    max_host_memory_gb : float, default = 0 (optional)
+        Upper bound on host memory (in GiB) used for the base-topology reorder
+        buffer. When <= 0, the whole base topology is reordered in a single
+        in-memory pass (no temporary files). When set, the base topology is
+        reordered through bucketed temporary files so that peak host memory
+        stays close to this budget.
+    num_threads : int, default = 0 (optional)
+        Number of host threads to use. When 0, the maximum number of threads
+        is used.
+    """
+
+    cdef cuvsHnswMaterializeParams* params
+    cdef object _dataset_path_bytes
+
+    def __cinit__(self):
+        check_cuvs(cuvsHnswMaterializeParamsCreate(&self.params))
+        self._dataset_path_bytes = None
+
+    def __dealloc__(self):
+        if self.params is not NULL:
+            check_cuvs(cuvsHnswMaterializeParamsDestroy(self.params))
+
+    def __init__(self, *,
+                 dataset_path=None,
+                 max_host_memory_gb=0,
+                 num_threads=0):
+        if dataset_path is not None:
+            self._dataset_path_bytes = dataset_path.encode('utf-8')
+            self.params.dataset_path = self._dataset_path_bytes
+        else:
+            self.params.dataset_path = NULL
+        self.params.max_host_memory_gb = max_host_memory_gb
+        self.params.num_threads = num_threads
+
+    @property
+    def dataset_path(self):
+        if self.params.dataset_path is not NULL:
+            return self.params.dataset_path.decode('utf-8')
+        return None
+
+    @property
+    def max_host_memory_gb(self):
+        return self.params.max_host_memory_gb
+
+    @property
+    def num_threads(self):
+        return self.params.num_threads
+
+
 @auto_sync_resources
 def save(filename, Index index, resources=None):
     """
@@ -406,6 +499,84 @@ def load(IndexParams index_params, filename, dim, dtype, metric="sqeuclidean",
     return idx
 
 
+@auto_sync_resources
+def materialize_to_hnswlib(MaterializeParams materialize_params,
+                           layered_artifact_path,
+                           output_path,
+                           dim,
+                           metric="sqeuclidean",
+                           resources=None):
+    """
+    Materialize a layered HNSW artifact into a standard hnswlib index file
+    on disk.
+
+    Materializes a `gpu_layered_on_disk` artifact (graph topology only, stored
+    in ACE order) plus a local dataset into a standard hnswlib index file,
+    without ever holding the full materialized index in host memory. The
+    resulting file is compatible with the original hnswlib library and can be
+    read back through `load()` with `hierarchy="cpu"`. The element data type
+    (float32, float16, uint8, int8) is read from the artifact header.
+
+    Parameters
+    ----------
+    materialize_params : MaterializeParams
+        Materialization parameters. `dataset_path` must point to the
+        original-ID-ordered vectors used to build the artifact.
+    layered_artifact_path : string
+        Path to the layered HNSW artifact.
+    output_path : string
+        Path to the hnswlib index file to write.
+    dim : int
+        Dimensions of the training dataset.
+    metric : string denoting the metric type, default="sqeuclidean"
+        Valid values for metric: ["sqeuclidean", "inner_product"], where
+            - sqeuclidean is the euclidean distance without the square root
+              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
+            - inner_product distance is defined as
+              distance(a, b) = \\sum_i a_i * b_i.
+    {resources_docstring}
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from cuvs.neighbors import hnsw
+    >>> n_features = 50
+    >>> # Assume a layered artifact was produced by an ACE GPU build and the
+    >>> # original-ID-ordered vectors are stored in "dataset.fbin".
+    >>> materialize_params = hnsw.MaterializeParams(
+    ...     dataset_path="dataset.fbin"
+    ... )
+    >>> hnsw.materialize_to_hnswlib(
+    ...     materialize_params,
+    ...     "layered_artifact.cuvs",
+    ...     "index.bin",
+    ...     n_features,
+    ...     metric="sqeuclidean",
+    ... )
+    >>> # The materialized index can be loaded as a standard hnswlib index.
+    >>> index = hnsw.load(
+    ...     hnsw.IndexParams(hierarchy="cpu"),
+    ...     "index.bin",
+    ...     n_features,
+    ...     np.float32,
+    ...     "sqeuclidean",
+    ... )
+    """
+    cdef string c_artifact = layered_artifact_path.encode('utf-8')
+    cdef string c_output = output_path.encode('utf-8')
+    cdef cuvsDistanceType distance_type = DISTANCE_TYPES[metric]
+    cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
+
+    check_cuvs(cuvsHnswMaterializeToHnswlib(
+        res,
+        materialize_params.params,
+        c_artifact.c_str(),
+        c_output.c_str(),
+        dim,
+        distance_type
+    ))
+
+
 @auto_sync_resources
 def from_cagra(IndexParams index_params, cagra.Index cagra_index,
                temporary_index_path=None, resources=None):