diff --git a/c/include/cuvs/neighbors/hnsw.h b/c/include/cuvs/neighbors/hnsw.h index 15eb1b0569..9f4c1c4445 100644 --- a/c/include/cuvs/neighbors/hnsw.h +++ b/c/include/cuvs/neighbors/hnsw.h @@ -35,7 +35,9 @@ enum cuvsHnswHierarchy { /* Full hierarchy is built using the CPU */ CPU = 1, /* Full hierarchy is built using the GPU */ - GPU = 2 + GPU = 2, + /* GPU-built hierarchy stored as a layered on-disk topology artifact */ + GPU_LAYERED_ON_DISK = 3 }; /** @@ -130,6 +132,14 @@ struct cuvsHnswIndexParams { * Set to nullptr for default behavior (from_cagra conversion). */ cuvsHnswAceParams_t ace_params; + /** + * Local dataset path used by layered HNSW deserialization. + * + * When `hierarchy == GPU_LAYERED_ON_DISK`, the index artifact stores graph topology only. + * `cuvsHnswDeserialize` loads the original-ID-ordered vectors from this local dataset path to + * reconstruct an in-memory HNSW index. Set to nullptr (default) for all other hierarchies. + */ + const char* dataset_path; }; typedef struct cuvsHnswIndexParams* cuvsHnswIndexParams_t; @@ -601,6 +611,80 @@ CUVS_EXPORT cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, * @} */ +/** + * @defgroup hnsw_c_index_materialize Materialize a layered HNSW artifact to an hnswlib index + * @{ + */ + +/** + * @brief Parameters for materializing a layered HNSW artifact into an hnswlib index on disk. + */ +struct cuvsHnswMaterializeParams { + /** + * Local dataset path holding the original-ID-ordered vectors used to build the artifact. + * + * Supported formats match layered deserialization: `.npy` and ANN benchmark `*.bin` files with a + * `[uint32 rows, uint32 cols]` header (`.fbin`, `.f16bin`, `.u8bin`, `.i8bin`). + */ + const char* dataset_path; + /** + * Upper bound on host memory (in GiB) used for the base-topology reorder buffer. + * + * When `<= 0`, the whole base topology is reordered in a single in-memory pass (no temporary + * files). When set, the base topology is reordered through bucketed temporary files so that peak + * host memory stays close to this budget. + */ + double max_host_memory_gb; + /** Number of host threads to use. When `0`, the maximum number of threads is used. */ + int num_threads; +}; + +typedef struct cuvsHnswMaterializeParams* cuvsHnswMaterializeParams_t; + +/** + * @brief Allocate HNSW materialize params, and populate with default values + * + * @param[in] params cuvsHnswMaterializeParams_t to allocate + * @return cuvsError_t + */ +CUVS_EXPORT cuvsError_t cuvsHnswMaterializeParamsCreate(cuvsHnswMaterializeParams_t* params); + +/** + * @brief De-allocate HNSW materialize params + * + * @param[in] params cuvsHnswMaterializeParams_t to de-allocate + * @return cuvsError_t + */ +CUVS_EXPORT cuvsError_t cuvsHnswMaterializeParamsDestroy(cuvsHnswMaterializeParams_t params); + +/** + * @brief Materialize a layered HNSW artifact into a standard hnswlib index file on disk. + * + * Materializes a `GPU_LAYERED_ON_DISK` artifact (graph topology only, stored in ACE order) plus a + * local dataset into a standard hnswlib index file, without ever holding the full materialized + * index in host memory. The resulting file is compatible with the original hnswlib library and can + * be read back through `cuvsHnswDeserialize` with `hierarchy == CPU`. The element data type + * (`float`, `half`, `uint8_t` or `int8_t`) is read from the artifact header. + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswMaterializeParams_t materialization parameters + * @param[in] layered_artifact_path path to the layered HNSW artifact + * @param[in] output_path path to the hnswlib index file to write + * @param[in] dim the dimension of the vectors in the index + * @param[in] metric the distance metric used to build the index + * @return cuvsError_t + */ +CUVS_EXPORT cuvsError_t cuvsHnswMaterializeToHnswlib(cuvsResources_t res, + cuvsHnswMaterializeParams_t params, + const char* layered_artifact_path, + const char* output_path, + int dim, + cuvsDistanceType metric); + +/** + * @} + */ + #ifdef __cplusplus } #endif diff --git a/c/src/neighbors/hnsw.cpp b/c/src/neighbors/hnsw.cpp index c69eda0ca0..c791c004a3 100644 --- a/c/src/neighbors/hnsw.cpp +++ b/c/src/neighbors/hnsw.cpp @@ -143,6 +143,8 @@ void* _deserialize(cuvsResources_t res, cuvs::neighbors::hnsw::index* index = nullptr; auto cpp_params = cuvs::neighbors::hnsw::index_params(); cpp_params.hierarchy = static_cast(params->hierarchy); + // Required by GPU_LAYERED_ON_DISK deserialization to locate the original-ID-ordered vectors. + if (params->dataset_path != nullptr) { cpp_params.dataset_path = std::string(params->dataset_path); } auto metric_type = static_cast(metric); cuvs::neighbors::hnsw::deserialize( *res_ptr, cpp_params, std::string(filename), dim, metric_type, &index); @@ -174,7 +176,8 @@ extern "C" cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params) .num_threads = 0, .M = 32, .metric = L2Expanded, - .ace_params = nullptr}; + .ace_params = nullptr, + .dataset_path = nullptr}; }); } @@ -404,3 +407,41 @@ extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, } }); } + +extern "C" cuvsError_t cuvsHnswMaterializeParamsCreate(cuvsHnswMaterializeParams_t* params) +{ + return cuvs::core::translate_exceptions([=] { + *params = new cuvsHnswMaterializeParams{ + .dataset_path = nullptr, .max_host_memory_gb = 0, .num_threads = 0}; + }); +} + +extern "C" cuvsError_t cuvsHnswMaterializeParamsDestroy(cuvsHnswMaterializeParams_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + +extern "C" cuvsError_t cuvsHnswMaterializeToHnswlib(cuvsResources_t res, + cuvsHnswMaterializeParams_t params, + const char* layered_artifact_path, + const char* output_path, + int dim, + cuvsDistanceType metric) +{ + return cuvs::core::translate_exceptions([=] { + auto res_ptr = reinterpret_cast(res); + auto cpp_params = cuvs::neighbors::hnsw::materialize_params(); + if (params->dataset_path != nullptr) { + cpp_params.dataset_path = std::string(params->dataset_path); + } + cpp_params.max_host_memory_gb = params->max_host_memory_gb; + cpp_params.num_threads = params->num_threads; + auto metric_type = static_cast(metric); + cuvs::neighbors::hnsw::materialize_to_hnswlib(*res_ptr, + cpp_params, + std::string(layered_artifact_path), + std::string(output_path), + dim, + metric_type); + }); +} diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu index 26028b6d98..65adc926f8 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -1,9 +1,10 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #include "../common/ann_types.hpp" +#include "../common/conf.hpp" #include "cuvs_ann_bench_param_parser.h" #include "cuvs_cagra_hnswlib_wrapper.h" @@ -27,6 +28,8 @@ auto parse_build_param(const nlohmann::json& conf) -> hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU; } else if (conf.at("hierarchy") == "gpu") { hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU; + } else if (conf.at("hierarchy") == "gpu_layered_on_disk") { + hnsw_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK; } else { THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get().c_str()); } @@ -36,6 +39,11 @@ auto parse_build_param(const nlohmann::json& conf) -> if (conf.contains("ef_construction")) { hnsw_params.ef_construction = conf.at("ef_construction"); } + if (conf.contains("dataset_path")) { + hnsw_params.dataset_path = conf.at("dataset_path"); + } else if (hnsw_params.hierarchy == cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK) { + hnsw_params.dataset_path = configuration::singleton().get_dataset_conf().base_file; + } if (conf.contains("num_threads")) { hnsw_params.num_threads = conf.at("num_threads"); } // Reuse the CAGRA wrapper params parser @@ -55,16 +63,18 @@ auto parse_build_param(const nlohmann::json& conf) -> cuvs::neighbors::cagra::hnsw_heuristic_type::SAME_GRAPH_FOOTPRINT, dist_type); ps.metric = dist_type; - // Parse ACE parameters if provided - if (conf.contains("npartitions") || conf.contains("build_dir") || - conf.contains("ef_construction") || conf.contains("use_disk")) { + // Parse ACE parameters if provided. + auto ace_conf = collect_conf_with_prefix(conf, "ace_"); + if (!ace_conf.empty()) { auto ace_params = cuvs::neighbors::cagra::graph_build_params::ace_params(); - if (conf.contains("npartitions")) { ace_params.npartitions = conf.at("npartitions"); } - if (conf.contains("build_dir")) { ace_params.build_dir = conf.at("build_dir"); } - if (conf.contains("ef_construction")) { - ace_params.ef_construction = conf.at("ef_construction"); + if (ace_conf.contains("npartitions")) { + ace_params.npartitions = ace_conf.at("npartitions"); + } + if (ace_conf.contains("build_dir")) { ace_params.build_dir = ace_conf.at("build_dir"); } + if (ace_conf.contains("ef_construction")) { + ace_params.ef_construction = ace_conf.at("ef_construction"); } - if (conf.contains("use_disk")) { ace_params.use_disk = conf.at("use_disk"); } + if (ace_conf.contains("use_disk")) { ace_params.use_disk = ace_conf.at("use_disk"); } ps.graph_build_params = ace_params; } // NB: above, we only provide the defaults. Below we parse the explicit parameters as usual. diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h index 2f0c54e1bd..8eb1b72991 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -9,10 +9,33 @@ #include #include +#include #include namespace cuvs::bench { +inline void move_file_overwrite(const std::filesystem::path& src, const std::filesystem::path& dst) +{ + std::error_code ec; + if (src == dst || + (std::filesystem::exists(dst, ec) && std::filesystem::equivalent(src, dst, ec))) { + return; + } + if (!dst.parent_path().empty()) { std::filesystem::create_directories(dst.parent_path()); } + if (std::filesystem::exists(dst, ec)) { std::filesystem::remove(dst, ec); } + + std::filesystem::rename(src, dst, ec); + if (ec) { + // Rename fails across filesystems. Fall back to copy followed by removal of the source. + ec.clear(); + std::filesystem::copy_file(src, dst, std::filesystem::copy_options::overwrite_existing, ec); + const auto src_str = src.string(); + const auto dst_str = dst.string(); + RAFT_EXPECTS(!ec, "Failed to move '%s' to '%s'.", src_str.c_str(), dst_str.c_str()); + std::filesystem::remove(src, ec); + } +} + template class cuvs_cagra_hnswlib : public algo, public algo_gpu { public: @@ -130,18 +153,25 @@ void cuvs_cagra_hnswlib::set_search_param(const search_param_base& para template void cuvs_cagra_hnswlib::save(const std::string& file) const { + if (build_param_.hnsw_index_params.hierarchy == + cuvs::neighbors::hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK) { + const auto src_artifact = std::filesystem::path(hnsw_index_->file_path()); + RAFT_EXPECTS(!src_artifact.empty(), "Layered HNSW artifact path is not available."); + RAFT_EXPECTS(std::filesystem::exists(src_artifact), + "Layered HNSW artifact '%s' does not exist.", + src_artifact.c_str()); + + move_file_overwrite(src_artifact, std::filesystem::path(file)); + return; + } + if (cagra_ace_build_) { std::string index_filename = hnsw_index_->file_path(); RAFT_EXPECTS(!index_filename.empty(), "HNSW index file path is not available."); RAFT_EXPECTS(std::filesystem::exists(index_filename), "Index file '%s' does not exist.", index_filename.c_str()); - if (std::filesystem::exists(file)) { std::filesystem::remove(file); } - // might fail when using 2 different filesystems - std::error_code ec; - std::filesystem::rename(index_filename, file, ec); - RAFT_EXPECTS( - !ec, "Failed to rename index file '%s' to '%s'.", index_filename.c_str(), file.c_str()); + move_file_overwrite(std::filesystem::path(index_filename), std::filesystem::path(file)); } else { cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get())); } diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp index fb726fed71..bc5a58972a 100644 --- a/cpp/include/cuvs/neighbors/hnsw.hpp +++ b/cpp/include/cuvs/neighbors/hnsw.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -41,9 +42,10 @@ namespace graph_build_params = cuvs::neighbors::graph_build_params; * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index. */ enum class HnswHierarchy { - NONE, // base-layer-only index - CPU, // full index with CPU-built hierarchy - GPU // full index with GPU-built hierarchy + NONE, // base-layer-only index + CPU, // full index with CPU-built hierarchy + GPU, // full index with GPU-built hierarchy + GPU_LAYERED_ON_DISK // GPU-built hierarchy stored as layered on-disk topology }; struct index_params : cuvs::neighbors::index_params { @@ -64,6 +66,16 @@ struct index_params : cuvs::neighbors::index_params { */ size_t M = 32; + /** Local dataset path used by layered HNSW deserialization. + * + * When `hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK`, the index artifact stores graph + * topology only. `deserialize` loads vectors from this local dataset path to reconstruct an + * in-memory HNSW index. + * Currently supported local dataset formats are `.npy` and ANN benchmark `*.bin` files with a + * `[uint32 rows, uint32 cols]` header. + */ + std::string dataset_path; + /** Parameters to fine tune GPU graph building. By default we select the parameters based on * dataset shape and HNSW build parameters. You can override these parameters to fine tune the * graph building process as described in the CAGRA build docs. @@ -1234,6 +1246,75 @@ void deserialize(raft::resources const& res, * @} */ +/** + * @defgroup hnsw_cpp_index_materialize Materialize a layered HNSW artifact into an hnswlib index + * @{ + */ + +/** + * @brief Parameters for materializing a layered HNSW artifact into an hnswlib index on disk. + */ +struct materialize_params { + /** Local dataset path holding the original-ID-ordered vectors used to build the artifact. + * + * Supported formats match layered deserialization: `.npy` and ANN benchmark `*.bin` files with a + * `[uint32 rows, uint32 cols]` header (`.fbin`, `.f16bin`, `.u8bin`, `.i8bin`). + */ + std::string dataset_path; + + /** Upper bound on host memory (in GiB) used for the base-topology reorder buffer. + * + * When `<= 0`, the whole base topology is reordered in a single in-memory pass (no temporary + * files). When set, the base topology is reordered through bucketed temporary files so that + * peak host memory stays close to this budget, at the cost of writing and re-reading the + * (small) base-topology section once. + */ + double max_host_memory_gb = 0; + + /** Number of host threads to use. When `0`, the maximum number of threads is used. */ + int num_threads = 0; +}; + +/** + * @brief Materialize a layered HNSW artifact into a standard hnswlib index file on disk. + * + * Materializes a `GPU_LAYERED_ON_DISK` artifact (graph topology only, stored in ACE order) plus a + * local dataset into a standard hnswlib index file, without ever holding the full materialized + * index in host memory. The materialization reorders the base topology from ACE order to + * original-id order and interleaves the vectors, emitting the output with sequential disk IO. The + * resulting file is compatible with the original hnswlib library (`loadIndex`) and can be read back + * through `cuvs::neighbors::hnsw::deserialize` with `hierarchy == HnswHierarchy::CPU`. + * + * The element data type (`float`, `half`, `uint8_t` or `int8_t`) is read from the artifact header, + * so a single entry point covers all supported dtypes. + * + * @param[in] res raft resources + * @param[in] params materialization parameters (dataset path, host-memory budget, threads) + * @param[in] layered_artifact_path path to the layered HNSW artifact + * @param[in] output_path path to the hnswlib index file to write + * @param[in] dim dimensions of the training dataset + * @param[in] metric distance metric. Supported metrics ("L2Expanded", "InnerProduct") + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * hnsw::materialize_params materialize_params; + * materialize_params.dataset_path = "dataset.fbin"; + * hnsw::materialize_to_hnswlib( + * res, materialize_params, "layered_artifact.cuvs", "index.bin", dim, metric); + * @endcode + */ +void materialize_to_hnswlib(raft::resources const& res, + const materialize_params& params, + const std::string& layered_artifact_path, + const std::string& output_path, + int dim, + cuvs::distance::DistanceType metric); + +/** + * @} + */ + } // namespace hnsw } // namespace neighbors } // namespace CUVS_EXPORT cuvs diff --git a/cpp/include/cuvs/util/file_io.hpp b/cpp/include/cuvs/util/file_io.hpp index f02e9c2d7d..52d10a53e7 100644 --- a/cpp/include/cuvs/util/file_io.hpp +++ b/cpp/include/cuvs/util/file_io.hpp @@ -254,6 +254,19 @@ void write_large_file(const file_descriptor& fd, const size_t total_bytes, const uint64_t file_offset); +/** + * @brief Pre-size a file to `total_bytes` bytes. + * + * Prefers posix_fallocate (reserves blocks up-front, avoids later ENOSPC and fragmentation), but + * falls back to ftruncate on filesystems that do not support preallocation (tmpfs and some + * NFS/overlay mounts return EOPNOTSUPP/EINVAL/ENOSYS) so the operation still succeeds there. A + * `total_bytes` of 0 is a no-op. Throws on failure (uses the descriptor's path in the message). + * + * @param fd File descriptor to pre-size + * @param total_bytes Target file size in bytes + */ +void preallocate_file(const file_descriptor& fd, const size_t total_bytes); + /** * @brief Buffered output stream wrapper * diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 96ff8344d3..3edbec46af 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -51,6 +51,13 @@ namespace cuvs::neighbors::cagra::detail { constexpr double to_mib(size_t bytes) { return static_cast(bytes) / (1 << 20); } constexpr double to_gib(size_t bytes) { return static_cast(bytes) / (1 << 30); } +inline auto progress_step_10(size_t completed, size_t total) -> size_t +{ + if (total == 0) { return 100; } + const auto percent = (std::min(completed, total) * 100) / total; + return completed >= total ? 100 : (percent / 10) * 10; +} + template void check_graph_degree(size_t& intermediate_degree, size_t& graph_degree, size_t dataset_size) { @@ -99,7 +106,7 @@ void ace_get_partition_labels( const size_t min_samples = 100 * n_partitions; n_samples = std::max(n_samples, min_samples); n_samples = std::min(n_samples, dataset_size); - RAFT_LOG_DEBUG("ACE: n_samples: %lu", n_samples); + RAFT_LOG_DEBUG("ACE build: partition labeling uses %lu sampled vectors", n_samples); auto sample_db = raft::make_host_matrix(n_samples, dataset_dim); #pragma omp parallel for @@ -123,18 +130,10 @@ void ace_get_partition_labels( auto _sub_distances = raft::make_host_matrix(chunk_size, n_partitions); auto _sub_dataset_dev = raft::make_device_matrix(res, chunk_size, dataset_dim); auto _sub_distances_dev = raft::make_device_matrix(res, chunk_size, n_partitions); - size_t report_interval = dataset_size / 10; - report_interval = (report_interval / chunk_size) * chunk_size; - report_interval = std::max(report_interval, chunk_size); + size_t next_progress_percent = 10; for (size_t i_base = 0; i_base < dataset_size; i_base += chunk_size) { const size_t sub_dataset_size = std::min(chunk_size, dataset_size - i_base); - if (i_base % report_interval == 0) { - RAFT_LOG_INFO("ACE: Processing chunk %lu / %lu (%.1f%%)", - i_base, - dataset_size, - static_cast(100 * i_base) / dataset_size); - } auto sub_dataset = raft::make_host_matrix_view( _sub_dataset.data_handle(), sub_dataset_size, dataset_dim); @@ -191,6 +190,16 @@ void ace_get_partition_labels( #pragma omp atomic update partition_histogram(augmented_label, 1) += 1; } + + const auto completed_rows = i_base + sub_dataset_size; + const auto report_percent = progress_step_10(completed_rows, dataset_size); + if (report_percent >= next_progress_percent) { + RAFT_LOG_INFO("ACE build: partition labeling progress %zu%% (%zu/%zu rows)", + report_percent, + completed_rows, + dataset_size); + next_progress_percent = report_percent + 10; + } } } @@ -261,15 +270,15 @@ void ace_check_partition_sizes( if (total_count > 0 && total_count < very_small_threshold) { RAFT_LOG_WARN( - "ACE: Partition %lu is very small (%lu vectors, expected ~%.1f). This may affect graph " - "quality.", + "ACE build: partition %lu is very small (%lu vectors, expected ~%.1f); graph quality may " + "be affected", c, total_count, expected_avg_vectors); } else if (total_count > very_large_threshold) { RAFT_LOG_WARN( - "ACE: Partition %lu is very large (%lu vectors, expected ~%.1f, threshold: %lu). This may " - "indicate imbalance and can lead to memory issues in restricted environments.", + "ACE build: partition %lu is very large (%lu vectors, expected ~%.1f, threshold=%lu); " + "partition imbalance may increase memory pressure", c, total_count, expected_avg_vectors, @@ -482,11 +491,10 @@ void ace_reorder_and_store_dataset( size_t dataset_dim = dataset.extent(1); size_t n_partitions = partition_histogram.extent(0); - RAFT_LOG_DEBUG( - "ACE: Reordering and storing dataset to disk (%lu vectors, %lu dimensions, %lu partitions)", - dataset_size, - dataset_dim, - n_partitions); + RAFT_LOG_DEBUG("ACE build: reordering dataset to disk (rows=%lu dim=%lu partitions=%lu)", + dataset_size, + dataset_dim, + n_partitions); // Calculate total sizes for pre-allocation size_t total_core_vectors = 0; @@ -509,10 +517,10 @@ void ace_reorder_and_store_dataset( size_t reordered_file_size = total_core_vectors * vector_size; size_t augmented_file_size = total_augmented_vectors * vector_size; - RAFT_LOG_DEBUG("ACE: Reordered dataset: %lu core vectors (%.2f GiB)", + RAFT_LOG_DEBUG("ACE build: reordered dataset section rows=%lu size=%.2f GiB", total_core_vectors, reordered_file_size / (1024.0 * 1024.0 * 1024.0)); - RAFT_LOG_DEBUG("ACE: Augmented dataset: %lu secondary vectors (%.2f GiB)", + RAFT_LOG_DEBUG("ACE build: augmented dataset section rows=%lu size=%.2f GiB", total_augmented_vectors, augmented_file_size / (1024.0 * 1024.0 * 1024.0)); @@ -541,7 +549,7 @@ void ace_reorder_and_store_dataset( disk_write_size = std::min(disk_write_size, 64 * 1024 * 1024); size_t vectors_per_buffer = std::max(64, disk_write_size / vector_size); - RAFT_LOG_DEBUG("ACE: Reorder buffers: %lu vectors per buffer (%.2f MiB)", + RAFT_LOG_DEBUG("ACE build: reorder buffer rows=%lu size=%.2f MiB", vectors_per_buffer, to_mib(vectors_per_buffer * vector_size)); @@ -593,8 +601,8 @@ void ace_reorder_and_store_dataset( } }; - size_t vectors_processed = 0; - const size_t log_interval = std::max(dataset_size / 10, size_t(1)); + size_t vectors_processed = 0; + size_t next_progress_percent = 10; for (size_t i = 0; i < dataset_size; i++) { size_t core_partition = partition_labels(i, 0); size_t secondary_partition = partition_labels(i, 1); @@ -623,16 +631,18 @@ void ace_reorder_and_store_dataset( } vectors_processed++; - if (vectors_processed % log_interval == 0) { - RAFT_LOG_INFO("ACE: Processed %lu/%lu vectors (%.1f%%)", + const auto report_percent = progress_step_10(vectors_processed, dataset_size); + if (report_percent >= next_progress_percent) { + RAFT_LOG_INFO("ACE build: dataset reorder progress %zu%% (%zu/%zu rows)", + report_percent, vectors_processed, - dataset_size, - 100.0 * vectors_processed / dataset_size); + dataset_size); + next_progress_percent = report_percent + 10; } } // Flush all remaining buffers - RAFT_LOG_DEBUG("ACE: Flushing remaining buffers..."); + RAFT_LOG_DEBUG("ACE build: flushing remaining reorder buffers"); #pragma omp parallel sections { #pragma omp section @@ -662,12 +672,12 @@ void ace_reorder_and_store_dataset( elapsed_ms > 0 ? to_mib(total_bytes_written) / (elapsed_ms / 1000.0) : 0.0; RAFT_LOG_INFO( - "ACE: Dataset (%.2f GiB reordered, %.2f GiB augmented, %.2f GiB mapping) reordering completed " - "in %ld ms (%.1f MiB/s)", + "ACE build: dataset reorder completed in %ld ms (reordered=%.2f GiB augmented=%.2f GiB " + "mapping=%.2f GiB, %.1f MiB/s)", + elapsed_ms, reordered_file_size / (1024.0 * 1024.0 * 1024.0), augmented_file_size / (1024.0 * 1024.0 * 1024.0), mapping_file_size / (1024.0 * 1024.0 * 1024.0), - elapsed_ms, throughput_mb_s); } @@ -685,17 +695,17 @@ void ace_load_partition_dataset_from_disk( { size_t n_partitions = partition_histogram.extent(0); - RAFT_LOG_DEBUG("ACE: Loading partition %lu dataset from disk", partition_id); + RAFT_LOG_DEBUG("ACE build: loading partition %lu dataset from disk", partition_id); size_t core_size = partition_histogram(partition_id, 0); size_t augmented_size = partition_histogram(partition_id, 1); size_t total_partition_size = core_size + augmented_size; - RAFT_LOG_DEBUG("ACE: Partition %lu: %lu core + %lu augmented = %lu total vectors", + RAFT_LOG_DEBUG("ACE build: partition %lu rows=%lu (core=%lu augmented=%lu)", partition_id, + total_partition_size, core_size, - augmented_size, - total_partition_size); + augmented_size); RAFT_EXPECTS(static_cast(sub_dataset.extent(0)) == total_partition_size, "sub_dataset rows (%lu) must match total partition size (%lu)", @@ -712,10 +722,10 @@ void ace_load_partition_dataset_from_disk( const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy"; if (!std::filesystem::exists(reordered_dataset_path)) { - RAFT_FAIL("ACE: Required file does not exist: %s", reordered_dataset_path.c_str()); + RAFT_FAIL("ACE build: required file does not exist: %s", reordered_dataset_path.c_str()); } if (!std::filesystem::exists(augmented_dataset_path)) { - RAFT_FAIL("ACE: Required file does not exist: %s", augmented_dataset_path.c_str()); + RAFT_FAIL("ACE build: required file does not exist: %s", augmented_dataset_path.c_str()); } size_t core_header_size = 0; @@ -748,7 +758,7 @@ void ace_load_partition_dataset_from_disk( core_file_offset += core_header_size; augmented_file_offset += augmented_header_size; - RAFT_LOG_DEBUG("ACE: Core file offset: %lu bytes, Augmented file offset: %lu bytes", + RAFT_LOG_DEBUG("ACE build: core file offset=%lu bytes augmented file offset=%lu bytes", core_file_offset, augmented_file_offset); @@ -763,7 +773,7 @@ void ace_load_partition_dataset_from_disk( try { if (core_size > 0) { RAFT_LOG_DEBUG( - "ACE: Reading %lu core vectors from offset %lu", core_size, core_file_offset); + "ACE build: reading %lu core vectors from offset %lu", core_size, core_file_offset); cuvs::util::file_descriptor reordered_fd(reordered_dataset_path, O_RDONLY); const size_t core_bytes = core_size * vector_size; cuvs::util::read_large_file( @@ -777,7 +787,7 @@ void ace_load_partition_dataset_from_disk( { try { if (augmented_size > 0) { - RAFT_LOG_DEBUG("ACE: Reading %lu augmented vectors from offset %lu", + RAFT_LOG_DEBUG("ACE build: reading %lu augmented vectors from offset %lu", augmented_size, augmented_file_offset); cuvs::util::file_descriptor augmented_fd(augmented_dataset_path, O_RDONLY); @@ -879,7 +889,8 @@ bool ace_check_use_disk_mode(bool use_disk, // Use overridden memory limits if provided (> 0), otherwise query actual system memory if (max_host_memory_gb.has_value() && max_host_memory_gb.value() > 0) { mem.available_host_memory = static_cast(max_host_memory_gb.value() * (1ULL << 30)); - RAFT_LOG_INFO("ACE: Using overridden host memory limit: %.2f GiB", max_host_memory_gb.value()); + RAFT_LOG_INFO("ACE build: using overridden host memory limit %.2f GiB", + max_host_memory_gb.value()); } else { mem.available_host_memory = cuvs::util::get_free_host_memory(); } @@ -903,7 +914,7 @@ bool ace_check_use_disk_mode(bool use_disk, mem.total_size = mem.partition_labels_size + mem.id_mapping_size + mem.sub_dataset_size + mem.sub_graph_size + mem.cagra_graph_size; - RAFT_LOG_INFO("ACE: Estimated host memory required: %.2f GiB, available: %.2f GiB", + RAFT_LOG_INFO("ACE build: host memory estimate required=%.2f GiB available=%.2f GiB", to_gib(mem.total_size), to_gib(mem.available_host_memory)); @@ -915,7 +926,8 @@ bool ace_check_use_disk_mode(bool use_disk, // TODO: Extend model or use managed memory if running out of GPU memory. if (max_gpu_memory_gb.has_value() && max_gpu_memory_gb.value() > 0) { mem.available_gpu_memory = static_cast(max_gpu_memory_gb.value() * (1ULL << 30)); - RAFT_LOG_INFO("ACE: Using overridden GPU memory limit: %.2f GiB", max_gpu_memory_gb.value()); + RAFT_LOG_INFO("ACE build: using overridden GPU memory limit %.2f GiB", + max_gpu_memory_gb.value()); } else { mem.available_gpu_memory = rmm::available_device_memory().second; } @@ -923,7 +935,7 @@ bool ace_check_use_disk_mode(bool use_disk, static_cast(usable_gpu_memory_fraction * mem.available_gpu_memory) < std::max(mem.sub_graph_size, mem.sub_dataset_size); - RAFT_LOG_INFO("ACE: Estimated GPU memory required: %.2f GiB, available: %.2f GiB", + RAFT_LOG_INFO("ACE build: GPU memory estimate required=%.2f GiB available=%.2f GiB", to_gib(mem.cagra_graph_size), to_gib(mem.available_gpu_memory)); @@ -934,7 +946,7 @@ bool ace_check_use_disk_mode(bool use_disk, valid_build_dir &= build_dir.find('\0') == std::string::npos; valid_build_dir &= build_dir.find("//") == std::string::npos; if (!valid_build_dir) { - RAFT_LOG_WARN("ACE: Invalid build_dir path, resetting to default: /tmp/ace_build"); + RAFT_LOG_WARN("ACE build: invalid build_dir path, resetting to /tmp/ace_build"); build_dir = "/tmp/ace_build"; } if (mkdir(build_dir.c_str(), 0755) != 0 && errno != EEXIST) { @@ -943,24 +955,19 @@ bool ace_check_use_disk_mode(bool use_disk, } if (host_memory_limited && gpu_memory_limited) { - RAFT_LOG_INFO( - "ACE: Graph does not fit in host and GPU memory. Using disk-mode with temporary storage %s", - build_dir.c_str()); + RAFT_LOG_INFO("ACE build: graph does not fit in host or GPU memory; using disk mode at %s", + build_dir.c_str()); } else if (host_memory_limited) { - RAFT_LOG_INFO( - "ACE: Graph does not fit in host memory. Using disk-mode with temporary storage %s", - build_dir.c_str()); + RAFT_LOG_INFO("ACE build: graph does not fit in host memory; using disk mode at %s", + build_dir.c_str()); } else if (gpu_memory_limited) { - RAFT_LOG_INFO( - "ACE: Graph does not fit in GPU memory. Using disk-mode with temporary storage %s", - build_dir.c_str()); + RAFT_LOG_INFO("ACE build: graph does not fit in GPU memory; using disk mode at %s", + build_dir.c_str()); } else if (use_disk) { - RAFT_LOG_INFO( - "ACE: Graph fits in host and GPU memory but disk mode is forced. Using disk-mode with " - "temporary storage %s", - build_dir.c_str()); + RAFT_LOG_INFO("ACE build: graph fits in host and GPU memory but disk mode is forced; using %s", + build_dir.c_str()); } else { - RAFT_LOG_INFO("ACE: Graph fits in host and GPU memory. Using in-memory mode."); + RAFT_LOG_INFO("ACE build: graph fits in host and GPU memory; using in-memory mode"); } return use_disk_mode; @@ -1005,8 +1012,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, disk_mode_host_required) { host_memory_insufficient = true; RAFT_LOG_WARN( - "ACE: Host memory insufficient for disk mode. Required: %.2f GiB, available: %.2f GiB. " - "Per-partition breakdown: dataset %.2f GiB, graph %.2f GiB, workspace %.2f GiB", + "ACE build: host memory is insufficient for disk mode: required=%.2f GiB available=%.2f " + "GiB (partition dataset=%.2f GiB graph=%.2f GiB workspace=%.2f GiB)", to_gib(disk_mode_host_required), to_gib(mem.available_host_memory), to_gib(mem.sub_dataset_size), @@ -1017,8 +1024,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, double available_for_scaling = usable_cpu_memory_fraction * mem.available_host_memory - mem.partition_labels_size - mem.id_mapping_size; RAFT_EXPECTS(available_for_scaling > 0, - "ACE: Host memory insufficient even for constant overhead (labels + id_mapping). " - "Required: %.2f GiB, available: %.2f GiB", + "ACE build: host memory is insufficient even for constant overhead " + "(labels + id_mapping): required=%.2f GiB available=%.2f GiB", to_gib(mem.partition_labels_size + mem.id_mapping_size), to_gib(usable_cpu_memory_fraction * mem.available_host_memory)); host_suggested_partitions = static_cast( @@ -1040,9 +1047,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, disk_mode_gpu_required) { gpu_memory_insufficient = true; RAFT_LOG_WARN( - "ACE: GPU memory insufficient for per-partition processing. Required: %.2f GiB, " - "available: %.2f GiB. Per-partition breakdown: dataset %.2f GiB, graph %.2f GiB, " - "workspace %.2f GiB", + "ACE build: GPU memory is insufficient for partition processing: required=%.2f GiB " + "available=%.2f GiB (partition dataset=%.2f GiB graph=%.2f GiB workspace=%.2f GiB)", to_gib(disk_mode_gpu_required), to_gib(mem.available_gpu_memory), to_gib(mem.sub_dataset_size), @@ -1060,8 +1066,7 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, size_t new_n_partitions = std::max(host_suggested_partitions, gpu_suggested_partitions); RAFT_LOG_WARN( - "ACE: Automatically increasing number of partitions from %zu to %zu to satisfy memory " - "constraints.%s%s", + "ACE build: increasing partitions from %zu to %zu to satisfy memory constraints.%s%s", original_n_partitions, new_n_partitions, host_memory_insufficient @@ -1088,8 +1093,8 @@ void ace_validate_disk_mode_partitions(size_t& n_partitions, guarantee_connectivity); RAFT_LOG_INFO( - "ACE: Updated per-partition memory estimates: dataset %.2f GiB, graph %.2f GiB, " - "host workspace %.2f GiB, GPU workspace %.2f GiB", + "ACE build: updated partition memory estimates dataset=%.2f GiB graph=%.2f GiB host " + "workspace=%.2f GiB GPU workspace=%.2f GiB", to_gib(mem.sub_dataset_size), to_gib(mem.sub_graph_size), to_gib(new_opt_host_ws), @@ -1131,15 +1136,15 @@ index build_ace(raft::resources const& res, size_t dataset_size = dataset.extent(0); size_t dataset_dim = dataset.extent(1); - RAFT_EXPECTS(dataset_size > 0, "ACE: Dataset must not be empty"); + RAFT_EXPECTS(dataset_size > 0, "ACE build: dataset must not be empty"); if (dataset_size < 1000) { - RAFT_LOG_WARN("ACE: Very small dataset size (%zu), consider using regular CAGRA build instead.", + RAFT_LOG_WARN("ACE build: very small dataset size (%zu); regular CAGRA may be simpler", dataset_size); } - RAFT_EXPECTS(dataset_dim > 0, "ACE: Dataset dimension must be greater than 0"); + RAFT_EXPECTS(dataset_dim > 0, "ACE build: dataset dimension must be greater than 0"); RAFT_EXPECTS(params.intermediate_graph_degree > 0, - "ACE: Intermediate graph degree must be greater than 0"); - RAFT_EXPECTS(params.graph_degree > 0, "ACE: Graph degree must be greater than 0"); + "ACE build: intermediate graph degree must be greater than 0"); + RAFT_EXPECTS(params.graph_degree > 0, "ACE build: graph degree must be greater than 0"); size_t n_partitions = npartitions; if (n_partitions == 0) { @@ -1152,17 +1157,22 @@ index build_ace(raft::resources const& res, n_partitions = dataset_size / min_required_per_partition; if (n_partitions < 2) { RAFT_LOG_WARN( - "ACE: Reduced number of partitions to the minimum of 2 to avoid tiny partitions. Consider " - "using regular CAGRA build instead."); + "ACE build: reduced partitions to the minimum of 2 to avoid tiny partitions; regular " + "CAGRA may be simpler"); n_partitions = 2; } else { - RAFT_LOG_WARN("ACE: Reduced number of partitions to %zu to avoid tiny partitions", - n_partitions); + RAFT_LOG_WARN("ACE build: reduced partitions to %zu to avoid tiny partitions", n_partitions); } } auto total_start = std::chrono::high_resolution_clock::now(); - RAFT_LOG_INFO("ACE: Starting partitioned CAGRA build with %zu partitions", n_partitions); + RAFT_LOG_INFO( + "ACE build: start rows=%zu dim=%zu partitions=%zu graph_degree=%zu intermediate_degree=%zu", + dataset_size, + dataset_dim, + n_partitions, + static_cast(params.graph_degree), + static_cast(params.intermediate_graph_degree)); size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1228,7 +1238,7 @@ index build_ace(raft::resources const& res, build_dir + "/cagra_graph.npy", {dataset_size, graph_degree}); RAFT_LOG_DEBUG( - "ACE: Wrote numpy headers (reordered: %zu, augmented: %zu, mapping: %zu, graph: %zu bytes)", + "ACE build: wrote numpy headers (reordered=%zu augmented=%zu mapping=%zu graph=%zu bytes)", reordered_header_size, augmented_header_size, mapping_header_size, @@ -1259,11 +1269,9 @@ index build_ace(raft::resources const& res, auto partition_elapsed = std::chrono::duration_cast(partition_end - partition_start) .count(); - RAFT_LOG_INFO( - "ACE: Partition labeling completed in %ld ms (min_partition_size: " - "%lu)", - partition_elapsed, - min_partition_size); + RAFT_LOG_INFO("ACE build: partition labeling completed in %ld ms (min_partition_size=%lu)", + partition_elapsed, + min_partition_size); // Create vector lists for each partition auto vectorlist_start = std::chrono::high_resolution_clock::now(); @@ -1288,7 +1296,7 @@ index build_ace(raft::resources const& res, auto vectorlist_elapsed = std::chrono::duration_cast(vectorlist_end - vectorlist_start) .count(); - RAFT_LOG_INFO("ACE: Vector list creation completed in %ld ms", vectorlist_elapsed); + RAFT_LOG_INFO("ACE build: partition mapping completed in %ld ms", vectorlist_elapsed); // Reorder the dataset based on partitions and store to disk. Uses write buffers to improve // performance. @@ -1319,7 +1327,7 @@ index build_ace(raft::resources const& res, // Process each partition auto partition_processing_start = std::chrono::high_resolution_clock::now(); for (size_t partition_id = 0; partition_id < n_partitions; partition_id++) { - RAFT_LOG_DEBUG("ACE: Processing partition %lu/%lu", partition_id + 1, n_partitions); + RAFT_LOG_DEBUG("ACE build: processing partition %lu/%lu", partition_id + 1, n_partitions); auto start = std::chrono::high_resolution_clock::now(); // Extract vectors for this partition @@ -1328,10 +1336,11 @@ index build_ace(raft::resources const& res, size_t sub_dataset_size = core_sub_dataset_size + augmented_sub_dataset_size; if (sub_dataset_size == 0) { - RAFT_LOG_WARN("ACE: Skipping empty partition %lu", partition_id); + RAFT_LOG_WARN("ACE build: Skipping empty partition %lu", partition_id); continue; } - RAFT_LOG_DEBUG("ACE: Sub-dataset size: %lu (%lu + %lu)", + RAFT_LOG_DEBUG("ACE build: partition %lu rows=%lu (core=%lu augmented=%lu)", + partition_id, sub_dataset_size, core_sub_dataset_size, augmented_sub_dataset_size); @@ -1444,12 +1453,12 @@ index build_ace(raft::resources const& res, ? to_mib(core_sub_dataset_size * dataset_dim * sizeof(T)) / (write_elapsed / 1000.0) : 0.0; RAFT_LOG_INFO( - "ACE: Partition %4lu (%8lu + %8lu) completed in %6ld ms: read %6ld ms (%7.1f MiB/s), " - "optimize %6ld ms, adjust %6ld ms, write %6ld ms (%7.1f MiB/s)", + "ACE build: partition %4lu completed in %6ld ms (core=%8lu augmented=%8lu read=%6ld ms " + "%.1f MiB/s optimize=%6ld ms adjust=%6ld ms write=%6ld ms %.1f MiB/s)", partition_id, + elapsed_ms, core_sub_dataset_size, augmented_sub_dataset_size, - elapsed_ms, read_elapsed, read_throughput, optimize_elapsed, @@ -1462,7 +1471,7 @@ index build_ace(raft::resources const& res, auto partition_processing_elapsed = std::chrono::duration_cast( partition_processing_end - partition_processing_start) .count(); - RAFT_LOG_INFO("ACE: All partition processing completed in %ld ms (%zu partitions)", + RAFT_LOG_INFO("ACE build: partition graph build completed in %ld ms (partitions=%zu)", partition_processing_elapsed, n_partitions); @@ -1472,7 +1481,7 @@ index build_ace(raft::resources const& res, const std::string augmented_dataset_path = build_dir + "/augmented_dataset.npy"; if (std::filesystem::exists(augmented_dataset_path)) { std::filesystem::remove(augmented_dataset_path); - RAFT_LOG_INFO("ACE: Removed augmented dataset file to save disk space"); + RAFT_LOG_INFO("ACE build: removed temporary augmented dataset"); } } @@ -1488,12 +1497,12 @@ index build_ace(raft::resources const& res, idx.update_dataset(res, dataset); } catch (std::bad_alloc& e) { RAFT_LOG_WARN( - "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be " - "stored."); + "ACE build: insufficient GPU memory to attach dataset to index; only the graph will " + "be stored"); } catch (raft::logic_error& e) { RAFT_LOG_WARN( - "Insufficient GPU memory to attach dataset to ACE index. Only the graph will be " - "stored."); + "ACE build: insufficient GPU memory to attach dataset to index; only the graph will " + "be stored"); } } } else { @@ -1501,37 +1510,36 @@ index build_ace(raft::resources const& res, idx.update_graph(res, std::move(graph_fd)); idx.update_mapping(res, std::move(mapping_fd)); - RAFT_LOG_INFO( - "ACE: Set disk storage at %s (dataset shape [%zu, %zu], graph shape [%zu, %zu])", - build_dir.c_str(), - idx.size(), - idx.dim(), - idx.size(), - idx.graph_degree()); + RAFT_LOG_INFO("ACE build: disk artifacts ready at %s (dataset=[%zu,%zu] graph=[%zu,%zu])", + build_dir.c_str(), + idx.size(), + idx.dim(), + idx.size(), + idx.graph_degree()); } auto index_creation_end = std::chrono::high_resolution_clock::now(); auto index_creation_elapsed = std::chrono::duration_cast( index_creation_end - index_creation_start) .count(); - RAFT_LOG_INFO("ACE: Final index creation completed in %ld ms", index_creation_elapsed); + RAFT_LOG_INFO("ACE build: final index initialized in %ld ms", index_creation_elapsed); auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast(total_end - total_start).count(); - RAFT_LOG_INFO("ACE: Partitioned CAGRA build completed in %ld ms total", total_elapsed); + RAFT_LOG_INFO("ACE build: completed in %ld ms", total_elapsed); return idx; } catch (const std::exception& e) { // Clean up build directory on failure if we created it - RAFT_LOG_ERROR("ACE: Build failed with exception: %s", e.what()); + RAFT_LOG_ERROR("ACE build: failed with exception: %s", e.what()); if (cleanup_on_failure && !build_dir.empty()) { - RAFT_LOG_INFO("ACE: Cleaning up build directory: %s", build_dir.c_str()); + RAFT_LOG_INFO("ACE build: cleaning up build directory %s", build_dir.c_str()); try { std::filesystem::remove_all(build_dir); - RAFT_LOG_INFO("ACE: Successfully removed build directory"); + RAFT_LOG_INFO("ACE build: removed build directory"); } catch (const std::exception& cleanup_error) { - RAFT_LOG_WARN("ACE: Failed to clean up build directory: %s", cleanup_error.what()); + RAFT_LOG_WARN("ACE build: failed to clean up build directory: %s", cleanup_error.what()); } } // Re-throw the original exception diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 4914a0fa1b..f701ba962d 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -23,15 +23,28 @@ #include #include +#include +#include +#include +#include +#include +#include #include #include #include +#include #include +#include #include +#include +#include #include -#include +#include #include +#include #include +#include +#include namespace cuvs::neighbors::hnsw::detail { @@ -166,6 +179,9 @@ struct index_impl : index { RAFT_LOG_INFO("Loading HNSW index from disk: %s", filepath.c_str()); try { + RAFT_EXPECTS(this->hierarchy() != HnswHierarchy::GPU_LAYERED_ON_DISK, + "Layered HNSW indexes must be loaded with hnsw::deserialize so a local dataset " + "can be provided through index_params.dataset_path."); appr_alg_ = std::make_unique::type>>( space_.get(), filepath); if (this->hierarchy() == HnswHierarchy::NONE) { appr_alg_->base_layer_only = true; } @@ -341,6 +357,861 @@ void all_neighbors_graph(raft::resources const& res, } } +struct hnsw_level_plan { + size_t n_rows = 0; + std::vector hist; + // Bucket-end offsets after order construction. Level L starts at offsets[L - 1]. + std::vector offsets; + std::vector order; + std::vector order_bw; + std::vector levels; + + [[nodiscard]] auto max_level() const -> int + { + return hist.empty() ? 0 : static_cast(hist.size() - 1); + } + + [[nodiscard]] auto promoted_count() const -> size_t + { + return hist.empty() ? 0 : n_rows - hist[0]; + } +}; + +inline auto hnsw_upper_layer_degree(size_t row_count, size_t M) -> size_t +{ + return row_count > M ? M : row_count > 0 ? row_count - 1 : 0; +} + +inline auto elapsed_ms_since(std::chrono::steady_clock::time_point start) -> int64_t +{ + return std::chrono::duration_cast(std::chrono::steady_clock::now() - + start) + .count(); +} + +inline auto elapsed_ms(std::chrono::steady_clock::duration elapsed) -> double +{ + return std::chrono::duration(elapsed).count(); +} + +inline auto to_gib(size_t bytes) -> double +{ + return static_cast(bytes) / (1024.0 * 1024.0 * 1024.0); +} + +inline auto throughput_gib_per_s(size_t bytes, int64_t elapsed_ms) -> double +{ + return elapsed_ms > 0 ? to_gib(bytes) / (elapsed_ms / 1000.0) : 0.0; +} + +inline auto throughput_gib_per_s(size_t bytes, std::chrono::steady_clock::duration elapsed) + -> double +{ + const auto elapsed_s = std::chrono::duration(elapsed).count(); + return elapsed_s > 0.0 ? to_gib(bytes) / elapsed_s : 0.0; +} + +inline auto progress_step_10(size_t completed, size_t total) -> size_t +{ + if (total == 0) { return 100; } + const auto percent = (std::min(completed, total) * 100) / total; + return completed >= total ? 100 : (percent / 10) * 10; +} + +inline auto make_hnsw_level_plan_from_levels(size_t n_rows, + std::vector&& levels, + bool build_reverse_order, + const char* log_prefix = nullptr) -> hnsw_level_plan +{ + hnsw_level_plan plan; + plan.n_rows = n_rows; + plan.levels = std::move(levels); + RAFT_EXPECTS(n_rows > 0, "HNSW hierarchy requires at least one row"); + RAFT_EXPECTS(plan.levels.size() == n_rows, + "HNSW level count (%zu) must match row count (%zu)", + plan.levels.size(), + n_rows); + + for (auto level : plan.levels) { + while (static_cast(level) >= plan.hist.size()) { + plan.hist.push_back(0); + } + plan.hist[level]++; + } + + plan.offsets.resize(plan.hist.size() + 1, 0); + for (size_t i = 0; i < plan.hist.size() - 1; ++i) { + plan.offsets[i + 1] = plan.offsets[i] + plan.hist[i]; + if (log_prefix != nullptr) { + RAFT_LOG_INFO("%s%zu : %zu", log_prefix, i + 1, n_rows - plan.offsets[i + 1]); + } + } + + plan.order.resize(n_rows); + if (build_reverse_order) { plan.order_bw.resize(n_rows); } + for (size_t i = 0; i < n_rows; ++i) { + const auto level = static_cast(plan.levels[i]); + if (build_reverse_order) { plan.order_bw[i] = plan.offsets[level]; } + plan.order[plan.offsets[level]++] = i; + } + + return plan; +} + +template +auto make_random_hnsw_level_plan(size_t n_rows, HnswAlgo& appr_algo, const char* log_prefix) + -> hnsw_level_plan +{ + std::vector levels(n_rows); + for (int64_t i = 0; i < static_cast(n_rows); ++i) { + const auto pt_level = appr_algo.getRandomLevel(appr_algo.mult_); + RAFT_EXPECTS(pt_level <= std::numeric_limits::max(), + "HNSW serialization only supports levels up to %u", + static_cast(std::numeric_limits::max())); + levels[i] = static_cast(pt_level); + } + + return make_hnsw_level_plan_from_levels(n_rows, std::move(levels), true, log_prefix); +} + +template +void build_hnsw_upper_layer_graphs( + raft::resources const& res, + raft::host_matrix_view promoted_dataset, + const hnsw_level_plan& plan, + size_t M, + cuvs::distance::DistanceType metric, + Callback&& callback) +{ + static_assert(std::is_same_v, + "HNSW upper-layer graph construction expects uint32_t neighbor ids"); + + const auto dim = static_cast(promoted_dataset.extent(1)); + for (size_t pt_level = 1; pt_level < plan.hist.size(); ++pt_level) { + const auto start_idx = plan.offsets[pt_level - 1]; + const auto removed_rows = start_idx - plan.offsets[0]; + const auto row_count = plan.n_rows - start_idx; + const auto neighbor_size = hnsw_upper_layer_degree(row_count, M); + + RAFT_LOG_INFO("Layered HNSW artifact: computing upper-layer neighbors level=%zu", pt_level); + auto host_neighbors = raft::make_host_matrix( + static_cast(row_count), static_cast(neighbor_size)); + + if (neighbor_size > 0) { + auto layer_dataset_view = raft::make_host_matrix_view( + promoted_dataset.data_handle() + removed_rows * dim, + static_cast(row_count), + promoted_dataset.extent(1)); + all_neighbors_graph(res, layer_dataset_view, host_neighbors.view(), metric); + } + + callback(pt_level, start_idx, host_neighbors); + } +} + +struct layered_hnsw_layer_info { + size_t level = 0; + size_t row_count = 0; + size_t degree = 0; + size_t node_offset = 0; + size_t link_offset = 0; +}; + +struct layered_hnsw_file_metadata { + size_t n_rows = 0; + size_t dim = 0; + size_t M = 0; + size_t maxM = 0; + size_t maxM0 = 0; + size_t ef_construction = 0; + size_t base_degree = 0; + size_t levels_bytes = 0; + size_t base_nodes_bytes = 0; + size_t base_link_row_bytes = 0; + size_t base_links_bytes = 0; + size_t upper_nodes_count = 0; + size_t upper_nodes_bytes = 0; + size_t upper_link_row_bytes = 0; + size_t upper_links_bytes = 0; + double mult = 0.0; + int maxlevel = 0; + int enterpoint_node = 0; + std::vector layers; +}; + +enum class layered_hnsw_dtype : uint32_t { + unknown = 0, + float32 = 1, + float16 = 2, + uint8 = 3, + int8 = 4, +}; + +// The layered HNSW artifact begins with this fixed-size POD header, immediately followed by +// `num_layers` layered_hnsw_layer_descriptor records and then the payload sections. +struct layered_hnsw_file_header { + char magic[32]; + uint64_t n_rows; + uint64_t dim; + uint64_t M; + uint64_t maxM; + uint64_t maxM0; + uint64_t ef_construction; + uint64_t base_degree; + uint64_t levels_bytes; + uint64_t base_nodes_bytes; + uint64_t base_link_row_bytes; + uint64_t base_links_bytes; + uint64_t upper_nodes_count; + uint64_t upper_nodes_bytes; + uint64_t upper_link_row_bytes; + uint64_t upper_links_bytes; + double mult; + uint32_t version; + uint32_t dtype; + uint32_t metric; + uint32_t num_layers; + int32_t maxlevel; + int32_t enterpoint_node; + uint32_t reserved0; + uint32_t reserved1; +}; +static_assert(sizeof(layered_hnsw_file_header) == 192, + "layered_hnsw_file_header must keep a fixed 192-byte on-disk layout"); + +struct layered_hnsw_layer_descriptor { + uint64_t level; + uint64_t row_count; + uint64_t degree; + uint64_t node_offset; + uint64_t link_offset; +}; +static_assert(sizeof(layered_hnsw_layer_descriptor) == 40, + "layered_hnsw_layer_descriptor must keep a fixed 40-byte on-disk layout"); + +constexpr const char* layered_hnsw_magic = "CUVS_HNSW_LAYERED"; +constexpr uint32_t layered_hnsw_version = 1; +constexpr size_t layered_hnsw_alignment = 64; + +inline auto align_up(size_t value, size_t alignment) -> size_t +{ + return ((value + alignment - 1) / alignment) * alignment; +} + +template +constexpr auto layered_dtype_code() -> layered_hnsw_dtype +{ + if constexpr (std::is_same_v) { + return layered_hnsw_dtype::float32; + } else if constexpr (std::is_same_v) { + return layered_hnsw_dtype::float16; + } else if constexpr (std::is_same_v) { + return layered_hnsw_dtype::uint8; + } else if constexpr (std::is_same_v) { + return layered_hnsw_dtype::int8; + } else { + return layered_hnsw_dtype::unknown; + } +} + +inline auto layered_dtype_name(layered_hnsw_dtype dtype) -> const char* +{ + switch (dtype) { + case layered_hnsw_dtype::float32: return "float32"; + case layered_hnsw_dtype::float16: return "float16"; + case layered_hnsw_dtype::uint8: return "uint8"; + case layered_hnsw_dtype::int8: return "int8"; + default: return "unknown"; + } +} + +inline auto metric_name(cuvs::distance::DistanceType metric) -> const char* +{ + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: return "L2Expanded"; + case cuvs::distance::DistanceType::InnerProduct: return "InnerProduct"; + default: return "Unknown"; + } +} + +template +auto make_layered_hnsw_header(const layered_hnsw_file_metadata& metadata, + cuvs::distance::DistanceType metric) -> layered_hnsw_file_header +{ + layered_hnsw_file_header header{}; + std::strncpy(header.magic, layered_hnsw_magic, sizeof(header.magic) - 1); + header.version = layered_hnsw_version; + header.dtype = static_cast(layered_dtype_code()); + header.metric = static_cast(metric); + header.num_layers = static_cast(metadata.layers.size()); + header.n_rows = metadata.n_rows; + header.dim = metadata.dim; + header.M = metadata.M; + header.maxM = metadata.maxM; + header.maxM0 = metadata.maxM0; + header.ef_construction = metadata.ef_construction; + header.base_degree = metadata.base_degree; + header.levels_bytes = metadata.levels_bytes; + header.base_nodes_bytes = metadata.base_nodes_bytes; + header.base_link_row_bytes = metadata.base_link_row_bytes; + header.base_links_bytes = metadata.base_links_bytes; + header.upper_nodes_count = metadata.upper_nodes_count; + header.upper_nodes_bytes = metadata.upper_nodes_bytes; + header.upper_link_row_bytes = metadata.upper_link_row_bytes; + header.upper_links_bytes = metadata.upper_links_bytes; + header.mult = metadata.mult; + header.maxlevel = metadata.maxlevel; + header.enterpoint_node = metadata.enterpoint_node; + return header; +} + +inline auto layered_hnsw_metadata_from_header(const layered_hnsw_file_header& header) + -> layered_hnsw_file_metadata +{ + layered_hnsw_file_metadata metadata; + metadata.n_rows = header.n_rows; + metadata.dim = header.dim; + metadata.M = header.M; + metadata.maxM = header.maxM; + metadata.maxM0 = header.maxM0; + metadata.ef_construction = header.ef_construction; + metadata.base_degree = header.base_degree; + metadata.levels_bytes = header.levels_bytes; + metadata.base_nodes_bytes = header.base_nodes_bytes; + metadata.base_link_row_bytes = header.base_link_row_bytes; + metadata.base_links_bytes = header.base_links_bytes; + metadata.upper_nodes_count = header.upper_nodes_count; + metadata.upper_nodes_bytes = header.upper_nodes_bytes; + metadata.upper_link_row_bytes = header.upper_link_row_bytes; + metadata.upper_links_bytes = header.upper_links_bytes; + metadata.mult = header.mult; + metadata.maxlevel = header.maxlevel; + metadata.enterpoint_node = header.enterpoint_node; + return metadata; +} + +struct npy_file { + cuvs::util::file_descriptor fd; + size_t header_size = 0; + std::vector shape; +}; + +inline auto open_npy_file(const std::string& path) -> npy_file +{ + std::ifstream stream(path, std::ios::binary); + RAFT_EXPECTS(stream.good(), "Failed to open numpy file: %s", path.c_str()); + auto header = raft::detail::numpy_serializer::read_header(stream); + auto header_size = static_cast(stream.tellg()); + auto fd = cuvs::util::file_descriptor(path, O_RDONLY); + return {std::move(fd), header_size, header.shape}; +} + +inline auto ends_with(const std::string& value, const std::string& suffix) -> bool +{ + return value.size() >= suffix.size() && + value.compare(value.size() - suffix.size(), suffix.size(), suffix) == 0; +} + +inline void copy_file_overwrite(const std::filesystem::path& src, const std::filesystem::path& dst) +{ + std::error_code ec; + if (src == dst || + (std::filesystem::exists(dst, ec) && std::filesystem::equivalent(src, dst, ec))) { + return; + } + if (!dst.parent_path().empty()) { std::filesystem::create_directories(dst.parent_path()); } + std::filesystem::copy_file(src, dst, std::filesystem::copy_options::overwrite_existing); +} + +template +inline auto open_layered_dataset_file(const std::string& path) -> npy_file +{ + if (std::filesystem::path(path).extension() == ".npy") { return open_npy_file(path); } + + std::ifstream stream(path, std::ios::binary); + RAFT_EXPECTS(stream.good(), "Failed to open dataset file: %s", path.c_str()); + std::array header{}; + stream.read(reinterpret_cast(header.data()), sizeof(uint32_t) * header.size()); + RAFT_EXPECTS(stream.gcount() == static_cast(sizeof(uint32_t) * header.size()), + "Failed to read ANN benchmark binary dataset header: %s", + path.c_str()); + + const auto ext = std::filesystem::path(path).extension().string(); + if constexpr (std::is_same_v) { + RAFT_EXPECTS(ext == ".fbin" && !ends_with(path, ".fp16.fbin"), + "Expected a .fbin dataset for float layered HNSW load: %s", + path.c_str()); + } else if constexpr (std::is_same_v) { + RAFT_EXPECTS(ext == ".f16bin" || ends_with(path, ".fp16.fbin"), + "Expected a .f16bin or .fp16.fbin dataset for half layered HNSW load: %s", + path.c_str()); + } else if constexpr (std::is_same_v) { + RAFT_EXPECTS( + ext == ".u8bin", "Expected a .u8bin dataset for uint8 layered HNSW load: %s", path.c_str()); + } else if constexpr (std::is_same_v) { + RAFT_EXPECTS( + ext == ".i8bin", "Expected a .i8bin dataset for int8 layered HNSW load: %s", path.c_str()); + } + + auto fd = cuvs::util::file_descriptor(path, O_RDONLY); + return {std::move(fd), sizeof(uint32_t) * header.size(), {header[0], header[1]}}; +} + +template +void write_layered_base_links_from_disk(const cuvs::neighbors::cagra::index& index_, + const cuvs::util::file_descriptor& output_fd, + size_t base_nodes_offset, + size_t base_links_offset, + size_t base_link_row_bytes, + size_t maxM0) +{ + static_assert(std::is_same_v, + "Layered HNSW artifacts store topology ids as uint32_t"); + const auto& graph_fd_opt = index_.graph_fd(); + const auto& mapping_fd_opt = index_.mapping_fd(); + RAFT_EXPECTS(graph_fd_opt.has_value() && graph_fd_opt->is_valid(), + "Graph file descriptor is not available"); + RAFT_EXPECTS(mapping_fd_opt.has_value() && mapping_fd_opt->is_valid(), + "Mapping file descriptor is not available"); + + const auto graph_path = graph_fd_opt->get_path(); + const auto mapping_path = mapping_fd_opt->get_path(); + RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor"); + RAFT_EXPECTS(!mapping_path.empty(), "Unable to get path from mapping file descriptor"); + + auto graph_npy = open_npy_file(graph_path); + auto mapping_npy = open_npy_file(mapping_path); + + RAFT_EXPECTS(graph_npy.shape.size() == 2, "Graph file should be 2D"); + RAFT_EXPECTS(mapping_npy.shape.size() == 1, "Mapping file should be 1D"); + const auto n_rows = graph_npy.shape[0]; + const auto degree = graph_npy.shape[1]; + RAFT_EXPECTS(mapping_npy.shape[0] == n_rows, + "Mapping elements (%zu) != graph rows (%zu)", + mapping_npy.shape[0], + n_rows); + RAFT_EXPECTS(n_rows == static_cast(index_.size()), + "Graph rows (%zu) != index size (%zu)", + n_rows, + static_cast(index_.size())); + RAFT_EXPECTS(degree == static_cast(index_.graph_degree()), + "Graph degree (%zu) != index graph degree (%zu)", + degree, + static_cast(index_.graph_degree())); + RAFT_EXPECTS(degree > 0, "Graph degree must be nonzero"); + RAFT_EXPECTS( + degree <= maxM0, "Base graph degree (%zu) must not exceed HNSW maxM0 (%zu)", degree, maxM0); + RAFT_EXPECTS(base_link_row_bytes >= sizeof(hnswlib::linklistsizeint) + maxM0 * sizeof(IdxT), + "Base link row size is too small"); + + RAFT_LOG_INFO("Layered HNSW artifact: loading ACE row mapping (%zu rows)", n_rows); + const auto mapping_start_time = std::chrono::steady_clock::now(); + const auto mapping_bytes = n_rows * sizeof(IdxT); + std::vector reordered_to_original(n_rows); + cuvs::util::read_large_file( + mapping_npy.fd, reordered_to_original.data(), mapping_bytes, mapping_npy.header_size); + for (size_t reordered_id = 0; reordered_id < n_rows; ++reordered_id) { + const auto original_id = static_cast(reordered_to_original[reordered_id]); + RAFT_EXPECTS(original_id < n_rows, + "Invalid original id %zu in ACE dataset mapping at row %zu", + original_id, + reordered_id); + } + const auto mapping_elapsed_ms = elapsed_ms_since(mapping_start_time); + RAFT_LOG_INFO("Layered HNSW artifact: ACE row mapping loaded in %ld ms (%.2f GiB, %.2f GiB/s)", + mapping_elapsed_ms, + to_gib(mapping_bytes), + throughput_gib_per_s(mapping_bytes, mapping_elapsed_ms)); + + RAFT_LOG_INFO( + "Layered HNSW artifact: writing base topology with source-sequential graph reads " + "(rows=%zu degree=%zu maxM0=%zu)", + n_rows, + degree, + maxM0); + + const auto total_start_time = std::chrono::steady_clock::now(); + const auto graph_row_bytes = degree * sizeof(IdxT); + const auto base_node_row_bytes = sizeof(IdxT); + const auto base_topology_row_size = graph_row_bytes + base_node_row_bytes + base_link_row_bytes; + const size_t target_batch_bytes = 64 * 1024 * 1024; + const size_t batch_size = std::max(1, target_batch_bytes / base_topology_row_size); + auto graph_buffer = raft::make_host_matrix(static_cast(batch_size), + static_cast(degree)); + std::vector base_node_buffer(batch_size); + std::vector base_link_buffer(batch_size * base_link_row_bytes); + + size_t graph_bytes_read = 0; + size_t node_bytes_written = 0; + size_t link_bytes_written = 0; + RAFT_LOG_INFO( + "Layered HNSW artifact: base topology batch_size=%zu graph_row=%zu bytes node_row=%zu bytes " + "link_row=%zu bytes", + batch_size, + graph_row_bytes, + base_node_row_bytes, + base_link_row_bytes); + size_t next_report_percent = 10; + for (size_t source_start = 0; source_start < n_rows; source_start += batch_size) { + const auto current_batch_size = std::min(batch_size, n_rows - source_start); + const auto batch_bytes = current_batch_size * graph_row_bytes; + cuvs::util::read_large_file(graph_npy.fd, + graph_buffer.data_handle(), + batch_bytes, + graph_npy.header_size + source_start * graph_row_bytes); + graph_bytes_read += batch_bytes; + + // Rows stay in source (ACE-reordered) order so the artifact is written sequentially on the + // build node. Each row records its original ID in base_nodes and remaps its neighbors to + // original IDs. The search node scatters each row into get_linklist0(original_id) on load. + bool invalid_neighbor = false; +#pragma omp parallel for reduction(|| : invalid_neighbor) + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { + const auto source_row = source_start + static_cast(batch_idx); + const auto original_id = static_cast(reordered_to_original[source_row]); + base_node_buffer[batch_idx] = static_cast(original_id); + auto* dst_row_ptr = base_link_buffer.data() + batch_idx * base_link_row_bytes; + hnswlib::linklistsizeint list_count = static_cast(degree); + std::memcpy(dst_row_ptr, &list_count, sizeof(list_count)); + auto* dst = reinterpret_cast(dst_row_ptr + sizeof(hnswlib::linklistsizeint)); + auto* src = graph_buffer.data_handle() + batch_idx * degree; + for (size_t j = 0; j < degree; ++j) { + const auto neighbor = static_cast(src[j]); + if (neighbor >= n_rows) { + invalid_neighbor = true; + continue; + } + dst[j] = reordered_to_original[neighbor]; + } + const auto written_bytes = sizeof(hnswlib::linklistsizeint) + degree * sizeof(IdxT); + if (written_bytes < base_link_row_bytes) { + std::memset(dst_row_ptr + written_bytes, 0, base_link_row_bytes - written_bytes); + } + } + RAFT_EXPECTS(!invalid_neighbor, "Invalid reordered neighbor id in ACE graph"); + + const auto current_node_bytes = current_batch_size * sizeof(IdxT); + const auto current_link_bytes = current_batch_size * base_link_row_bytes; + cuvs::util::write_large_file(output_fd, + base_node_buffer.data(), + current_node_bytes, + base_nodes_offset + source_start * sizeof(IdxT)); + cuvs::util::write_large_file(output_fd, + base_link_buffer.data(), + current_link_bytes, + base_links_offset + source_start * base_link_row_bytes); + node_bytes_written += current_node_bytes; + link_bytes_written += current_link_bytes; + + const auto completed_rows = source_start + current_batch_size; + const auto report_percent = progress_step_10(completed_rows, n_rows); + if (report_percent >= next_report_percent) { + const auto elapsed = elapsed_ms_since(total_start_time); + RAFT_LOG_INFO( + "Layered HNSW artifact: base topology progress %zu%% (%zu/%zu rows, read=%.2f GiB " + "write=%.2f GiB, %ld ms)", + report_percent, + completed_rows, + n_rows, + to_gib(graph_bytes_read), + to_gib(node_bytes_written + link_bytes_written), + elapsed); + next_report_percent = report_percent + 10; + } + } + + const auto total_elapsed_ms = elapsed_ms_since(total_start_time); + RAFT_LOG_INFO( + "Layered HNSW artifact: base topology written in %ld ms (graph_read=%.2f GiB %.2f GiB/s, " + "artifact_write=%.2f GiB %.2f GiB/s, nodes=%.2f GiB links=%.2f GiB)", + total_elapsed_ms, + to_gib(graph_bytes_read), + throughput_gib_per_s(graph_bytes_read, total_elapsed_ms), + to_gib(node_bytes_written + link_bytes_written), + throughput_gib_per_s(node_bytes_written + link_bytes_written, total_elapsed_ms), + to_gib(node_bytes_written), + to_gib(link_bytes_written)); +} + +template +auto serialize_to_layered_hnsw_from_disk( + raft::resources const& res, + const cuvs::neighbors::hnsw::index_params& params, + const cuvs::neighbors::cagra::index& index_, + raft::host_matrix_view dataset) -> std::string +{ + raft::common::nvtx::range fun_scope("hnsw::serialize_layered"); + const auto total_start_time = std::chrono::steady_clock::now(); + + RAFT_EXPECTS(index_.graph_fd().has_value() && index_.mapping_fd().has_value(), + "Layered HNSW serialization requires a disk-backed ACE graph and mapping."); + RAFT_EXPECTS(static_cast(dataset.extent(0)) == static_cast(index_.size()), + "Dataset rows (%zu) must match index size (%zu)", + static_cast(dataset.extent(0)), + static_cast(index_.size())); + RAFT_EXPECTS(static_cast(dataset.extent(1)) == static_cast(index_.dim()), + "Dataset cols (%zu) must match index dimensions (%zu)", + static_cast(dataset.extent(1)), + static_cast(index_.dim())); + + const auto graph_path = index_.graph_fd()->get_path(); + RAFT_EXPECTS(!graph_path.empty(), "Unable to get path from graph file descriptor"); + const auto ace_dir = std::filesystem::path(graph_path).parent_path(); + const auto artifact_file = ace_dir / "hnsw_index.cuvs"; + std::filesystem::create_directories(ace_dir); + + auto n_rows = static_cast(index_.size()); + auto dim = static_cast(index_.dim()); + auto graph_degree_int = static_cast(index_.graph_degree()); + + auto hnsw_index = + std::make_unique>(index_.dim(), index_.metric(), params.hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction); + + RAFT_LOG_INFO("Layered HNSW artifact: generating hierarchy levels"); + const auto hierarchy_start_time = std::chrono::steady_clock::now(); + auto hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, nullptr); + const auto hierarchy_elapsed_ms = elapsed_ms_since(hierarchy_start_time); + RAFT_LOG_INFO( + "Layered HNSW artifact: hierarchy levels generated in %ld ms (max_level=%d promoted=%zu)", + hierarchy_elapsed_ms, + hierarchy.max_level(), + hierarchy.promoted_count()); + + layered_hnsw_file_metadata metadata; + metadata.n_rows = n_rows; + metadata.dim = dim; + metadata.M = appr_algo->M_; + metadata.maxM = appr_algo->maxM_; + metadata.maxM0 = appr_algo->maxM0_; + metadata.ef_construction = appr_algo->ef_construction_; + metadata.mult = appr_algo->mult_; + metadata.maxlevel = hierarchy.max_level(); + metadata.enterpoint_node = static_cast(hierarchy.order.back()); + metadata.base_degree = static_cast(graph_degree_int); + metadata.levels_bytes = n_rows * sizeof(uint8_t); + metadata.base_nodes_bytes = n_rows * sizeof(IdxT); + metadata.base_link_row_bytes = appr_algo->size_links_level0_; + metadata.base_links_bytes = n_rows * metadata.base_link_row_bytes; + metadata.upper_link_row_bytes = appr_algo->size_links_per_element_; + size_t next_upper_node_offset = 0; + size_t next_upper_link_offset = 0; + for (size_t pt_level = 1; pt_level < hierarchy.hist.size(); pt_level++) { + auto start_idx = hierarchy.offsets[pt_level - 1]; + auto row_count = n_rows - start_idx; + auto layer_degree = hnsw_upper_layer_degree(row_count, appr_algo->M_); + metadata.layers.push_back( + {pt_level, row_count, layer_degree, next_upper_node_offset, next_upper_link_offset}); + next_upper_node_offset += row_count; + next_upper_link_offset += row_count; + } + metadata.upper_nodes_count = next_upper_node_offset; + metadata.upper_nodes_bytes = metadata.upper_nodes_count * sizeof(IdxT); + metadata.upper_links_bytes = next_upper_link_offset * metadata.upper_link_row_bytes; + + const auto header = make_layered_hnsw_header(metadata, index_.metric()); + std::vector layer_descriptors(metadata.layers.size()); + for (size_t i = 0; i < metadata.layers.size(); ++i) { + const auto& layer = metadata.layers[i]; + layer_descriptors[i] = {static_cast(layer.level), + static_cast(layer.row_count), + static_cast(layer.degree), + static_cast(layer.node_offset), + static_cast(layer.link_offset)}; + } + const auto descriptors_offset = sizeof(layered_hnsw_file_header); + const auto descriptors_bytes = layer_descriptors.size() * sizeof(layered_hnsw_layer_descriptor); + + const auto payload_offset = + align_up(descriptors_offset + descriptors_bytes, layered_hnsw_alignment); + const auto levels_offset = payload_offset; + const auto base_nodes_offset = levels_offset + metadata.levels_bytes; + const auto base_links_offset = base_nodes_offset + metadata.base_nodes_bytes; + const auto upper_nodes_offset = base_links_offset + metadata.base_links_bytes; + const auto upper_links_offset = upper_nodes_offset + metadata.upper_nodes_bytes; + const auto final_file_size = upper_links_offset + metadata.upper_links_bytes; + + cuvs::util::file_descriptor artifact_fd(artifact_file.string(), O_CREAT | O_RDWR | O_TRUNC, 0644); + cuvs::util::preallocate_file(artifact_fd, final_file_size); + cuvs::util::write_large_file(artifact_fd, &header, sizeof(header), 0); + if (descriptors_bytes > 0) { + cuvs::util::write_large_file( + artifact_fd, layer_descriptors.data(), descriptors_bytes, descriptors_offset); + } + + const auto levels_start_time = std::chrono::steady_clock::now(); + cuvs::util::write_large_file( + artifact_fd, hierarchy.levels.data(), metadata.levels_bytes, levels_offset); + const auto levels_elapsed_ms = elapsed_ms_since(levels_start_time); + RAFT_LOG_INFO("Layered HNSW artifact: levels section written in %ld ms (%.2f GiB, %.2f GiB/s)", + levels_elapsed_ms, + to_gib(metadata.levels_bytes), + throughput_gib_per_s(metadata.levels_bytes, levels_elapsed_ms)); + + RAFT_LOG_INFO("Layered HNSW artifact: writing hnswlib-ready base topology section"); + const auto layer0_start_time = std::chrono::steady_clock::now(); + write_layered_base_links_from_disk(index_, + artifact_fd, + base_nodes_offset, + base_links_offset, + metadata.base_link_row_bytes, + metadata.maxM0); + const auto layer0_elapsed_ms = elapsed_ms_since(layer0_start_time); + static_cast(layer0_elapsed_ms); + RAFT_LOG_INFO( + "Layered HNSW artifact: base topology section written in %ld ms (%.2f GiB, %.2f GiB/s)", + layer0_elapsed_ms, + to_gib(metadata.base_nodes_bytes + metadata.base_links_bytes), + throughput_gib_per_s(metadata.base_nodes_bytes + metadata.base_links_bytes, layer0_elapsed_ms)); + + size_t upper_graph_bytes_written = 0; + if (hierarchy.hist.size() > 1) { + RAFT_LOG_INFO("Layered HNSW artifact: gathering promoted vectors"); + const auto gather_start_time = std::chrono::steady_clock::now(); + auto host_query_set = + raft::make_host_matrix(static_cast(hierarchy.promoted_count()), dim); +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(n_rows); i++) { + if (hierarchy.levels[i] > 0) { + const auto query_row = hierarchy.order_bw[i] - hierarchy.hist[0]; + auto* dst = host_query_set.data_handle() + query_row * dim; + std::copy(&dataset(i, 0), &dataset(i, 0) + dim, dst); + } + } + const auto gather_elapsed_ms = elapsed_ms_since(gather_start_time); + const auto gathered_bytes = hierarchy.promoted_count() * dim * sizeof(T); + RAFT_LOG_INFO( + "Layered HNSW artifact: promoted vectors gathered in %ld ms (%.2f GiB copied, %.2f GiB/s)", + gather_elapsed_ms, + to_gib(gathered_bytes), + throughput_gib_per_s(gathered_bytes, gather_elapsed_ms)); + + auto promoted_dataset = raft::make_host_matrix_view( + host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1)); + const auto upper_layers_start_time = std::chrono::steady_clock::now(); + build_hnsw_upper_layer_graphs( + res, + promoted_dataset, + hierarchy, + appr_algo->M_, + index_.metric(), + [&](size_t pt_level, size_t start_idx, auto& host_neighbors) { + const auto& layer = metadata.layers[pt_level - 1]; + RAFT_LOG_DEBUG("Layered HNSW artifact: writing upper layer level=%zu rows=%zu degree=%zu", + layer.level, + layer.row_count, + layer.degree); + const auto layer_write_start_time = std::chrono::steady_clock::now(); + const size_t target_batch_bytes = 64 * 1024 * 1024; + const size_t row_bytes = sizeof(IdxT) + metadata.upper_link_row_bytes; + const size_t batch_size = std::max(1, target_batch_bytes / row_bytes); + std::vector node_buffer(batch_size); + std::vector link_buffer(batch_size * metadata.upper_link_row_bytes); + for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += batch_size) { + const auto current_batch_size = std::min(batch_size, layer.row_count - batch_start); + std::fill(link_buffer.begin(), + link_buffer.begin() + current_batch_size * metadata.upper_link_row_bytes, + 0); + // Upper-layer rows stay in promoted (per-level) order for sequential writes. node_buffer + // records each node's original ID and neighbors are remapped to original IDs. The search + // node scatters each row into get_linklist(original_id, level) on load. +#pragma omp parallel for + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); + ++batch_idx) { + const auto row = batch_start + static_cast(batch_idx); + node_buffer[batch_idx] = static_cast(hierarchy.order[start_idx + row]); + auto* link_row = link_buffer.data() + batch_idx * metadata.upper_link_row_bytes; + hnswlib::linklistsizeint list_count = + static_cast(layer.degree); + std::memcpy(link_row, &list_count, sizeof(list_count)); + auto* dst = reinterpret_cast(link_row + sizeof(hnswlib::linklistsizeint)); + if (layer.degree > 0) { + auto* src = host_neighbors.data_handle() + row * layer.degree; + for (size_t j = 0; j < layer.degree; ++j) { + dst[j] = static_cast(hierarchy.order[src[j] + start_idx]); + } + } + } + cuvs::util::write_large_file( + artifact_fd, + node_buffer.data(), + current_batch_size * sizeof(IdxT), + upper_nodes_offset + (layer.node_offset + batch_start) * sizeof(IdxT)); + cuvs::util::write_large_file( + artifact_fd, + link_buffer.data(), + current_batch_size * metadata.upper_link_row_bytes, + upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes); + } + const auto layer_bytes = layer.row_count * (sizeof(IdxT) + metadata.upper_link_row_bytes); + upper_graph_bytes_written += layer_bytes; + const auto layer_write_elapsed_ms = elapsed_ms_since(layer_write_start_time); + static_cast(layer_write_elapsed_ms); + RAFT_LOG_INFO( + "Layered HNSW artifact: upper layer level=%zu written in %ld ms (%.2f GiB, %.2f GiB/s)", + layer.level, + layer_write_elapsed_ms, + to_gib(layer_bytes), + throughput_gib_per_s(layer_bytes, layer_write_elapsed_ms)); + }); + const auto upper_layers_elapsed_ms = elapsed_ms_since(upper_layers_start_time); + RAFT_LOG_INFO( + "Layered HNSW artifact: upper layers generated and written in %ld ms (%.2f GiB, %.2f " + "GiB/s)", + upper_layers_elapsed_ms, + to_gib(upper_graph_bytes_written), + throughput_gib_per_s(upper_graph_bytes_written, upper_layers_elapsed_ms)); + } + + const auto total_elapsed_ms = elapsed_ms_since(total_start_time); + RAFT_LOG_INFO("Layered HNSW artifact: wrote %s in %ld ms (artifact=%.2f GiB, %.2f GiB/s)", + artifact_file.string().c_str(), + total_elapsed_ms, + to_gib(final_file_size), + throughput_gib_per_s(final_file_size, total_elapsed_ms)); + return artifact_file.string(); +} + +// Build the standard hnswlib index header (matches HierarchicalNSW::saveIndex byte layout). Single +// source of the on-disk header encoding, shared by the streaming serialize path and the +// disk-to-disk materialize path so the two cannot drift. +inline auto make_hnswlib_native_header(size_t offset_level0, + size_t n_rows, + size_t size_data_per_element, + size_t label_offset, + size_t offset_data, + int maxlevel, + int enterpoint_node, + size_t maxM, + size_t maxM0, + size_t M, + double mult, + size_t ef_construction) -> std::vector +{ + std::vector buffer; + auto append = [&buffer](const auto& value) { + const auto* bytes = reinterpret_cast(&value); + buffer.insert(buffer.end(), bytes, bytes + sizeof(value)); + }; + const size_t max_elements = n_rows; + const size_t cur_element_count = n_rows; + append(offset_level0); + append(max_elements); + append(cur_element_count); + append(size_data_per_element); + append(label_offset); + append(offset_data); + append(maxlevel); + append(enterpoint_node); + append(maxM); + append(maxM0); + append(M); + append(mult); + append(ef_construction); + return buffer; +} + template void serialize_to_hnswlib_from_disk(raft::resources const& res, std::ostream& os_raw, @@ -501,44 +1372,12 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, hnsw_index->get_space(), 1, (graph_degree_int + 1) / 2, params.ef_construction); bool create_hierarchy = params.hierarchy != HnswHierarchy::NONE; - - // create hierarchy order - // sort the points by levels - // roll dice & build histogram - std::vector hist; - std::vector order(n_rows); - std::vector order_bw(n_rows); - std::vector levels(n_rows); - std::vector offsets; - - if (create_hierarchy) { - RAFT_LOG_INFO("Sort points by levels"); - for (int64_t i = 0; i < n_rows; i++) { - auto pt_level = appr_algo->getRandomLevel(appr_algo->mult_); - while (pt_level >= static_cast(hist.size())) - hist.push_back(0); - hist[pt_level]++; - levels[i] = pt_level; - } - - // accumulate - offsets.resize(hist.size() + 1, 0); - for (size_t i = 0; i < hist.size() - 1; i++) { - offsets[i + 1] = offsets[i] + hist[i]; - RAFT_LOG_INFO("Level %zu : %zu", i + 1, size_t(n_rows) - offsets[i + 1]); - } - - // fw/bw indices - for (int64_t i = 0; i < n_rows; i++) { - auto pt_level = levels[i]; - order_bw[i] = offsets[pt_level]; - order[offsets[pt_level]++] = i; - } - } + hnsw_level_plan hierarchy; + if (create_hierarchy) { hierarchy = make_random_hnsw_level_plan(n_rows, *appr_algo, "Level "); } // set last point of the highest level as the entry point - appr_algo->enterpoint_node_ = create_hierarchy ? order.back() : n_rows / 2; - appr_algo->maxlevel_ = create_hierarchy ? hist.size() - 1 : 1; + appr_algo->enterpoint_node_ = create_hierarchy ? hierarchy.order.back() : n_rows / 2; + appr_algo->maxlevel_ = create_hierarchy ? hierarchy.max_level() : 1; // write header information RAFT_LOG_DEBUG("Writing HNSW header: offsetLevel0=%zu, n_rows=%zu, size_data_per_element=%zu", @@ -552,37 +1391,25 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, appr_algo->maxM0_, appr_algo->M_); - // offset_level_0 - os.write(reinterpret_cast(&appr_algo->offsetLevel0_), sizeof(std::size_t)); - // 8 max_element - override with n_rows - size_t num_elements = (size_t)n_rows; - os.write(reinterpret_cast(&num_elements), sizeof(std::size_t)); - // 16 curr_element_count - override with n_rows - os.write(reinterpret_cast(&num_elements), sizeof(std::size_t)); - // 24 size_data_per_element - os.write(reinterpret_cast(&appr_algo->size_data_per_element_), sizeof(std::size_t)); - // 32 label_offset - os.write(reinterpret_cast(&appr_algo->label_offset_), sizeof(std::size_t)); - // 40 offset_data - os.write(reinterpret_cast(&appr_algo->offsetData_), sizeof(std::size_t)); - // 48 maxlevel - os.write(reinterpret_cast(&appr_algo->maxlevel_), sizeof(int)); - // 52 enterpoint_node - os.write(reinterpret_cast(&appr_algo->enterpoint_node_), sizeof(int)); - // 56 maxM - os.write(reinterpret_cast(&appr_algo->maxM_), sizeof(std::size_t)); - // 64 maxM0 - os.write(reinterpret_cast(&appr_algo->maxM0_), sizeof(std::size_t)); - // 72 M - os.write(reinterpret_cast(&appr_algo->M_), sizeof(std::size_t)); - // 80 mult - os.write(reinterpret_cast(&appr_algo->mult_), sizeof(double)); - // 88 ef_construction - os.write(reinterpret_cast(&appr_algo->ef_construction_), sizeof(std::size_t)); + // Write the hnswlib index header (max_element / cur_element_count overridden with n_rows) using + // the shared encoder so the byte layout stays in lockstep with the materialize path. + const auto native_header = make_hnswlib_native_header(appr_algo->offsetLevel0_, + static_cast(n_rows), + appr_algo->size_data_per_element_, + appr_algo->label_offset_, + appr_algo->offsetData_, + appr_algo->maxlevel_, + appr_algo->enterpoint_node_, + appr_algo->maxM_, + appr_algo->maxM0_, + appr_algo->M_, + appr_algo->mult_, + appr_algo->ef_construction_); + os.write(native_header.data(), static_cast(native_header.size())); // host queries auto host_query_set = - raft::make_host_matrix(create_hierarchy ? n_rows - hist[0] : 0, dim); + raft::make_host_matrix(create_hierarchy ? hierarchy.promoted_count() : 0, dim); int64_t d_report_offset = n_rows / 10; // Report progress in 10% steps. int64_t next_report_offset = d_report_offset; @@ -690,11 +1517,11 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, const T* data_row = &dataset_buffer(batch_idx, 0); os.write(reinterpret_cast(data_row), sizeof(T) * dim); - if (create_hierarchy && levels[i] > 0) { + if (create_hierarchy && hierarchy.levels[i] > 0) { // position in query: order_bw[i]-hist[0] - std::copy(data_row, - data_row + dim, - reinterpret_cast(&host_query_set(order_bw[i] - hist[0], 0))); + auto* dst = host_query_set.data_handle() + + (hierarchy.order_bw[i] - hierarchy.hist[0]) * static_cast(dim); + std::copy(data_row, data_row + dim, dst); } // assign original label @@ -732,22 +1559,15 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, // trigger knn builds for all levels std::vector> host_neighbors; if (create_hierarchy) { - for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) { - auto num_pts = n_rows - offsets[pt_level - 1]; - auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1; - host_neighbors.emplace_back(raft::make_host_matrix(num_pts, neighbor_size)); - } - for (size_t pt_level = 1; pt_level < hist.size(); pt_level++) { - RAFT_LOG_INFO("Compute hierarchy neighbors level %zu", pt_level); - auto removed_rows = offsets[pt_level - 1] - offsets[0]; - raft::host_matrix_view sub_query_view( - host_query_set.data_handle() + removed_rows * dim, - host_query_set.extent(0) - removed_rows, - dim); - auto neighbor_view = host_neighbors[pt_level - 1].view(); - all_neighbors_graph( - res, raft::make_const_mdspan(sub_query_view), neighbor_view, index_.metric()); - } + host_neighbors.reserve(hierarchy.hist.size() - 1); + build_hnsw_upper_layer_graphs( + res, + raft::make_host_matrix_view( + host_query_set.data_handle(), host_query_set.extent(0), host_query_set.extent(1)), + hierarchy, + appr_algo->M_, + index_.metric(), + [&](size_t, size_t, auto& neighbors) { host_neighbors.emplace_back(std::move(neighbors)); }); } if (create_hierarchy) { @@ -758,7 +1578,7 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, start_clock = std::chrono::system_clock::now(); for (int64_t i = 0; i < n_rows; i++) { - size_t cur_level = create_hierarchy ? levels[i] : 0; + size_t cur_level = create_hierarchy ? hierarchy.levels[i] : 0; unsigned int linkListSize = create_hierarchy && cur_level > 0 ? appr_algo->size_links_per_element_ * cur_level : 0; os.write(reinterpret_cast(&linkListSize), sizeof(int)); @@ -766,14 +1586,16 @@ void serialize_to_hnswlib_from_disk(raft::resources const& res, if (linkListSize) { for (size_t pt_level = 1; pt_level <= cur_level; pt_level++) { auto neighbor_view = host_neighbors[pt_level - 1].view(); - auto my_row = order_bw[i] - offsets[pt_level - 1]; + auto my_row = hierarchy.order_bw[i] - hierarchy.offsets[pt_level - 1]; - IdxT* neighbors = &neighbor_view(my_row, 0); unsigned int extent = neighbor_view.extent(1); os.write(reinterpret_cast(&extent), sizeof(int)); - for (unsigned int j = 0; j < extent; j++) { - const IdxT converted = order[neighbors[j] + offsets[pt_level - 1]]; - os.write(reinterpret_cast(&converted), sizeof(IdxT)); + if (extent > 0) { + IdxT* neighbors = &neighbor_view(my_row, 0); + for (unsigned int j = 0; j < extent; j++) { + const IdxT converted = hierarchy.order[neighbors[j] + hierarchy.offsets[pt_level - 1]]; + os.write(reinterpret_cast(&converted), sizeof(IdxT)); + } } auto remainder = appr_algo->M_ - neighbor_view.extent(1); for (size_t j = 0; j < remainder; j++) { @@ -1088,6 +1910,18 @@ std::unique_ptr> from_cagra( std::filesystem::exists(index_directory) && std::filesystem::is_directory(index_directory), "Directory '%s' does not exist", index_directory.c_str()); + if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) { + RAFT_EXPECTS(dataset.has_value(), + "Layered HNSW serialization requires the original-order dataset."); + auto artifact_path = + serialize_to_layered_hnsw_from_disk(res, params, cagra_index, dataset.value()); + + auto hnsw_index = + std::make_unique>(cagra_index.dim(), cagra_index.metric(), params.hierarchy); + hnsw_index->set_file_descriptor(cuvs::util::file_descriptor(artifact_path, O_RDONLY)); + return hnsw_index; + } + std::string index_filename = (std::filesystem::path(index_directory) / "hnsw_index.bin").string(); @@ -1118,6 +1952,8 @@ std::unique_ptr> from_cagra( return from_cagra(res, params, cagra_index, dataset); } else if (params.hierarchy == HnswHierarchy::GPU) { return from_cagra(res, params, cagra_index, dataset); + } else if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) { + RAFT_FAIL("GPU_LAYERED_ON_DISK requires disk-backed ACE build artifacts."); } else { RAFT_FAIL("Unsupported hierarchy type"); } @@ -1228,6 +2064,13 @@ void serialize(raft::resources const& res, const std::string& filename, const in "Disk-based index file does not exist: %s", source_path.c_str()); + if (idx_impl->hierarchy() == HnswHierarchy::GPU_LAYERED_ON_DISK) { + copy_file_overwrite(source_path, filename); + RAFT_LOG_INFO( + "Copied layered HNSW index from %s to %s", source_path.c_str(), filename.c_str()); + return; + } + // Copy the file to the new location std::filesystem::copy_file( source_path, filename, std::filesystem::copy_options::overwrite_existing); @@ -1241,6 +2084,1074 @@ void serialize(raft::resources const& res, const std::string& filename, const in hnswlib_index->saveIndex(filename); } +// Parsed + fully validated view of a layered HNSW artifact: header metadata, the payload section +// offsets, and the per-row levels array. Both the in-memory deserialize path and the disk-to-disk +// materialize path consume this so the artifact format (offset arithmetic and the size-invariant +// checks) lives in a single place and cannot drift between them. +struct layered_artifact_view { + layered_hnsw_file_metadata metadata; + size_t artifact_size = 0; + size_t levels_offset = 0; + size_t base_nodes_offset = 0; + size_t base_links_offset = 0; + size_t upper_nodes_offset = 0; + size_t upper_links_offset = 0; + std::vector levels; // per-row top level, length n_rows +}; + +// Reads and validates the artifact header/descriptors/levels from an already-open artifact fd. +// Enforces every structural invariant (magic, version, dtype, metric, dim, layer count, section +// sizes, file length, and the levels max-level) BEFORE the caller allocates buffers sized from +// these fields, so a corrupt/truncated/crafted header cannot drive an out-of-bounds read or write. +template +auto read_and_validate_layered_artifact(const cuvs::util::file_descriptor& artifact_fd, + const std::string& artifact_path, + int dim, + cuvs::distance::DistanceType metric) + -> layered_artifact_view +{ + layered_hnsw_file_header header{}; + cuvs::util::read_large_file(artifact_fd, &header, sizeof(header), 0); + RAFT_EXPECTS(std::strncmp(header.magic, layered_hnsw_magic, sizeof(header.magic)) == 0, + "Invalid layered HNSW artifact magic: %s", + artifact_path.c_str()); + RAFT_EXPECTS(header.version == layered_hnsw_version, + "Unsupported layered HNSW artifact version %u", + header.version); + RAFT_EXPECTS(header.dtype == static_cast(layered_dtype_code()), + "Layered HNSW artifact dtype (%s) does not match requested dtype (%s)", + layered_dtype_name(static_cast(header.dtype)), + layered_dtype_name(layered_dtype_code())); + RAFT_EXPECTS(header.metric == static_cast(metric), + "Layered HNSW artifact metric (%s) does not match requested metric (%s)", + metric_name(static_cast(header.metric)), + metric_name(metric)); + + layered_artifact_view view; + view.artifact_size = static_cast(std::filesystem::file_size(artifact_path)); + const auto descriptors_offset = sizeof(layered_hnsw_file_header); + const auto descriptors_bytes = + static_cast(header.num_layers) * sizeof(layered_hnsw_layer_descriptor); + RAFT_EXPECTS(descriptors_offset + descriptors_bytes <= view.artifact_size, + "Layered HNSW layer descriptors are outside artifact: offset=%zu size=%zu " + "artifact=%zu", + descriptors_offset, + descriptors_bytes, + view.artifact_size); + + std::vector layer_descriptors(header.num_layers); + if (descriptors_bytes > 0) { + cuvs::util::read_large_file( + artifact_fd, layer_descriptors.data(), descriptors_bytes, descriptors_offset); + } + auto& metadata = view.metadata; + metadata = layered_hnsw_metadata_from_header(header); + metadata.layers.reserve(layer_descriptors.size()); + for (const auto& descriptor : layer_descriptors) { + metadata.layers.push_back({static_cast(descriptor.level), + static_cast(descriptor.row_count), + static_cast(descriptor.degree), + static_cast(descriptor.node_offset), + static_cast(descriptor.link_offset)}); + } + + RAFT_EXPECTS(metadata.n_rows > 0, "Layered HNSW artifact must contain at least one row"); + RAFT_EXPECTS(metadata.dim > 0, "Layered HNSW artifact must contain at least one dimension"); + RAFT_EXPECTS(static_cast(dim) == metadata.dim, + "Layered HNSW artifact dim (%zu) does not match requested dim (%d)", + metadata.dim, + dim); + RAFT_EXPECTS(metadata.layers.size() == static_cast(metadata.maxlevel), + "Layered HNSW artifact has %zu upper layers, expected %d", + metadata.layers.size(), + metadata.maxlevel); + + // Section-size invariants: every buffer below is sized from n_rows / upper_nodes_count, so the + // declared section byte counts must match exactly or a later read/scatter would overrun. + RAFT_EXPECTS(metadata.levels_bytes == metadata.n_rows * sizeof(uint8_t), + "Layered HNSW levels section size mismatch"); + RAFT_EXPECTS(metadata.base_nodes_bytes == metadata.n_rows * sizeof(uint32_t), + "Layered HNSW base node section size mismatch"); + RAFT_EXPECTS(metadata.base_links_bytes == metadata.n_rows * metadata.base_link_row_bytes, + "Layered HNSW base links section size mismatch"); + RAFT_EXPECTS(metadata.upper_nodes_bytes == metadata.upper_nodes_count * sizeof(uint32_t), + "Layered HNSW upper node section size mismatch"); + RAFT_EXPECTS( + metadata.upper_links_bytes == metadata.upper_nodes_count * metadata.upper_link_row_bytes, + "Layered HNSW upper link section size mismatch"); + RAFT_EXPECTS(metadata.base_degree <= metadata.maxM0, + "Layered HNSW base degree (%zu) exceeds maxM0 (%zu)", + metadata.base_degree, + metadata.maxM0); + + const auto payload_offset = + align_up(descriptors_offset + descriptors_bytes, layered_hnsw_alignment); + view.levels_offset = payload_offset; + view.base_nodes_offset = view.levels_offset + metadata.levels_bytes; + view.base_links_offset = view.base_nodes_offset + metadata.base_nodes_bytes; + view.upper_nodes_offset = view.base_links_offset + metadata.base_links_bytes; + view.upper_links_offset = view.upper_nodes_offset + metadata.upper_nodes_bytes; + const auto expected_file_size = view.upper_links_offset + metadata.upper_links_bytes; + RAFT_EXPECTS(view.artifact_size >= expected_file_size, + "Layered HNSW artifact is truncated: expected at least %zu bytes, got %zu", + expected_file_size, + view.artifact_size); + + view.levels.resize(metadata.n_rows); + cuvs::util::read_large_file( + artifact_fd, view.levels.data(), metadata.levels_bytes, view.levels_offset); + const auto max_level_in_levels = *std::max_element(view.levels.begin(), view.levels.end()); + RAFT_EXPECTS(static_cast(max_level_in_levels) == metadata.maxlevel, + "Layered HNSW levels max level (%d) does not match artifact maxlevel (%d)", + static_cast(max_level_in_levels), + metadata.maxlevel); + return view; +} + +template +auto deserialize_layered_hnsw(raft::resources const& res, + const index_params& params, + const std::string& artifact_path, + int dim, + cuvs::distance::DistanceType metric) -> std::unique_ptr> +{ + common::nvtx::range fun_scope("hnsw::deserialize_layered"); + const auto total_start_time = std::chrono::steady_clock::now(); + const auto metadata_start_time = std::chrono::steady_clock::now(); + cuvs::util::file_descriptor artifact_fd(artifact_path, O_RDONLY); + auto view = read_and_validate_layered_artifact(artifact_fd, artifact_path, dim, metric); + auto& metadata = view.metadata; + const auto artifact_size = view.artifact_size; + const auto base_nodes_offset = view.base_nodes_offset; + const auto base_links_offset = view.base_links_offset; + const auto upper_nodes_offset = view.upper_nodes_offset; + const auto upper_links_offset = view.upper_links_offset; + const auto metadata_elapsed_ms = elapsed_ms_since(metadata_start_time); + + RAFT_EXPECTS(!params.dataset_path.empty(), + "Layered HNSW deserialization requires index_params.dataset_path"); + RAFT_LOG_INFO("Layered HNSW load: metadata read in %ld ms (rows=%zu dim=%zu artifact=%.2f GiB)", + metadata_elapsed_ms, + metadata.n_rows, + metadata.dim, + to_gib(artifact_size)); + + const auto dataset_open_start_time = std::chrono::steady_clock::now(); + auto dataset_file = open_layered_dataset_file(params.dataset_path); + const auto dataset_open_elapsed_ms = elapsed_ms_since(dataset_open_start_time); + + RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == metadata.n_rows && + dataset_file.shape[1] == metadata.dim, + "Layered HNSW dataset shape mismatch: artifact rows=%zu dim=%zu, dataset rows=%zu " + "dim=%zu path=%s", + metadata.n_rows, + metadata.dim, + dataset_file.shape.size() > 0 ? dataset_file.shape[0] : 0, + dataset_file.shape.size() > 1 ? dataset_file.shape[1] : 0, + params.dataset_path.c_str()); + + RAFT_LOG_INFO("Layered HNSW load: dataset header validated in %ld ms (%s)", + dataset_open_elapsed_ms, + params.dataset_path.c_str()); + + const auto dataset_total_bytes = metadata.n_rows * metadata.dim * sizeof(T); + const auto deserialize_progress_total_bytes = + metadata.levels_bytes + dataset_total_bytes + metadata.base_nodes_bytes + + metadata.base_links_bytes + metadata.upper_nodes_bytes + metadata.upper_links_bytes; + size_t deserialize_progress_bytes = 0; + size_t deserialize_next_progress_percent = 10; + auto log_deserialize_progress = [&](size_t bytes_loaded) { + if (deserialize_progress_total_bytes == 0) { return; } + deserialize_progress_bytes = + std::min(deserialize_progress_total_bytes, deserialize_progress_bytes + bytes_loaded); + const auto progress_percent = + (deserialize_progress_bytes * 100) / deserialize_progress_total_bytes; + const auto progress_step = std::min(100, (progress_percent / 10) * 10); + if (progress_step >= deserialize_next_progress_percent) { + RAFT_LOG_INFO("Layered HNSW load: deserialize progress %zu%% (%.2f/%.2f GiB)", + progress_step, + to_gib(deserialize_progress_bytes), + to_gib(deserialize_progress_total_bytes)); + deserialize_next_progress_percent = progress_step + 10; + } + }; + + auto& levels_u8 = view.levels; + log_deserialize_progress(metadata.levels_bytes); + + const auto allocation_start_time = std::chrono::steady_clock::now(); + auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), metadata.n_rows, metadata.M, metadata.ef_construction); + appr_algo->cur_element_count = metadata.n_rows; + appr_algo->maxlevel_ = metadata.maxlevel; + appr_algo->enterpoint_node_ = metadata.enterpoint_node; + std::fill_n(appr_algo->linkLists_, metadata.n_rows, nullptr); + RAFT_EXPECTS(appr_algo->size_links_level0_ == metadata.base_link_row_bytes, + "Layered HNSW base link row size mismatch"); + RAFT_EXPECTS(appr_algo->size_links_per_element_ == metadata.upper_link_row_bytes, + "Layered HNSW upper link row size mismatch"); + RAFT_EXPECTS(appr_algo->maxM0_ == metadata.maxM0 && appr_algo->maxM_ == metadata.maxM, + "Layered HNSW M parameter mismatch"); + const auto allocation_elapsed_ms = elapsed_ms_since(allocation_start_time); + RAFT_LOG_INFO( + "Layered HNSW load: allocated hnswlib storage in %ld ms (base=%.2f GiB upper=%.2f GiB)", + allocation_elapsed_ms, + to_gib(metadata.n_rows * appr_algo->size_data_per_element_), + to_gib(metadata.upper_nodes_count * metadata.upper_link_row_bytes)); + + auto num_threads = + params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads; + + const size_t target_batch_bytes = 64 * 1024 * 1024; + const size_t dataset_row_bytes = metadata.dim * sizeof(T); + const size_t batch_size = std::max(1, target_batch_bytes / dataset_row_bytes); + auto dataset_buffer = + raft::make_host_matrix(static_cast(batch_size), metadata.dim); + + const auto base_start_time = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration dataset_read_time{}; + std::chrono::steady_clock::duration base_copy_time{}; + size_t dataset_bytes_read = 0; + for (size_t batch_start = 0; batch_start < metadata.n_rows; batch_start += batch_size) { + const auto current_batch_size = std::min(batch_size, metadata.n_rows - batch_start); + const auto current_dataset_bytes = current_batch_size * metadata.dim * sizeof(T); + auto batch_timer = std::chrono::steady_clock::now(); + cuvs::util::read_large_file(dataset_file.fd, + dataset_buffer.data_handle(), + current_dataset_bytes, + dataset_file.header_size + batch_start * metadata.dim * sizeof(T)); + dataset_read_time += std::chrono::steady_clock::now() - batch_timer; + dataset_bytes_read += current_dataset_bytes; + + bool link_list_allocation_failed = false; + batch_timer = std::chrono::steady_clock::now(); +#pragma omp parallel for num_threads(num_threads) reduction(|| : link_list_allocation_failed) + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { + const auto i = batch_start + static_cast(batch_idx); + auto level = static_cast(levels_u8[i]); + appr_algo->element_levels_[i] = level; + memcpy(appr_algo->getDataByInternalId(i), + dataset_buffer.data_handle() + batch_idx * metadata.dim, + appr_algo->data_size_); + *appr_algo->getExternalLabeLp(i) = static_cast(i); + if (level > 0) { + const auto link_list_size = appr_algo->size_links_per_element_ * level + 1; + appr_algo->linkLists_[i] = static_cast(malloc(link_list_size)); + if (appr_algo->linkLists_[i] == nullptr) { + link_list_allocation_failed = true; + continue; + } + } + } + if (link_list_allocation_failed) { + for (size_t i = 0; i < metadata.n_rows; ++i) { + free(appr_algo->linkLists_[i]); + appr_algo->linkLists_[i] = nullptr; + } + throw std::runtime_error("Not enough memory to allocate HNSW upper linklists"); + } + base_copy_time += std::chrono::steady_clock::now() - batch_timer; + log_deserialize_progress(current_dataset_bytes); + } + const auto base_elapsed_ms = elapsed_ms_since(base_start_time); + RAFT_LOG_INFO( + "Layered HNSW load: hnswlib data and levels initialized in %ld ms " + "(dataset read %.2f ms %.2f GiB %.2f GiB/s, copy %.2f ms)", + base_elapsed_ms, + elapsed_ms(dataset_read_time), + to_gib(dataset_bytes_read), + throughput_gib_per_s(dataset_bytes_read, dataset_read_time), + elapsed_ms(base_copy_time)); + + const auto base_topology_start_time = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration base_topology_read_time{}; + std::chrono::steady_clock::duration base_topology_copy_time{}; + size_t base_topology_bytes_read = 0; + const size_t base_topology_row_bytes = sizeof(uint32_t) + metadata.base_link_row_bytes; + const size_t base_topology_batch_size = + std::max(1, target_batch_bytes / base_topology_row_bytes); + std::vector base_node_buffer(base_topology_batch_size); + std::vector base_link_buffer(base_topology_batch_size * metadata.base_link_row_bytes); + for (size_t batch_start = 0; batch_start < metadata.n_rows; + batch_start += base_topology_batch_size) { + const auto current_batch_size = + std::min(base_topology_batch_size, metadata.n_rows - batch_start); + const auto current_base_topology_bytes = + current_batch_size * (sizeof(uint32_t) + metadata.base_link_row_bytes); + auto batch_timer = std::chrono::steady_clock::now(); + cuvs::util::read_large_file(artifact_fd, + base_node_buffer.data(), + current_batch_size * sizeof(uint32_t), + base_nodes_offset + batch_start * sizeof(uint32_t)); + cuvs::util::read_large_file(artifact_fd, + base_link_buffer.data(), + current_batch_size * metadata.base_link_row_bytes, + base_links_offset + batch_start * metadata.base_link_row_bytes); + base_topology_read_time += std::chrono::steady_clock::now() - batch_timer; + base_topology_bytes_read += current_base_topology_bytes; + + batch_timer = std::chrono::steady_clock::now(); + bool invalid_node_id = false; +#pragma omp parallel for num_threads(num_threads) reduction(|| : invalid_node_id) + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); ++batch_idx) { + const auto node_id = static_cast(base_node_buffer[batch_idx]); + if (node_id >= metadata.n_rows) { + invalid_node_id = true; + continue; + } + auto ll0 = appr_algo->get_linklist0(node_id); + memcpy(ll0, + base_link_buffer.data() + batch_idx * metadata.base_link_row_bytes, + metadata.base_link_row_bytes); + } + RAFT_EXPECTS(!invalid_node_id, "Invalid base-layer node id in layered HNSW artifact"); + base_topology_copy_time += std::chrono::steady_clock::now() - batch_timer; + log_deserialize_progress(current_base_topology_bytes); + } + const auto base_topology_elapsed_ms = elapsed_ms_since(base_topology_start_time); + RAFT_LOG_INFO( + "Layered HNSW load: base-layer topology loaded in %ld ms " + "(read %.2f ms %.2f GiB %.2f GiB/s, scatter-copy %.2f ms)", + base_topology_elapsed_ms, + elapsed_ms(base_topology_read_time), + to_gib(base_topology_bytes_read), + throughput_gib_per_s(base_topology_bytes_read, base_topology_read_time), + elapsed_ms(base_topology_copy_time)); + + const auto upper_start_time = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration upper_read_time{}; + std::chrono::steady_clock::duration upper_copy_time{}; + size_t upper_bytes_read = 0; + for (const auto& layer : metadata.layers) { + const auto layer_row_bytes = sizeof(uint32_t) + metadata.upper_link_row_bytes; + const auto layer_batch_size = + std::min(std::max(1, target_batch_bytes / layer_row_bytes), layer.row_count); + std::vector node_buffer(layer_batch_size); + std::vector link_buffer(layer_batch_size * metadata.upper_link_row_bytes); + for (size_t batch_start = 0; batch_start < layer.row_count; batch_start += layer_batch_size) { + const auto current_batch_size = std::min(layer_batch_size, layer.row_count - batch_start); + const auto current_upper_bytes = + current_batch_size * (sizeof(uint32_t) + metadata.upper_link_row_bytes); + auto batch_timer = std::chrono::steady_clock::now(); + cuvs::util::read_large_file( + artifact_fd, + node_buffer.data(), + current_batch_size * sizeof(uint32_t), + upper_nodes_offset + (layer.node_offset + batch_start) * sizeof(uint32_t)); + cuvs::util::read_large_file( + artifact_fd, + link_buffer.data(), + current_batch_size * metadata.upper_link_row_bytes, + upper_links_offset + (layer.link_offset + batch_start) * metadata.upper_link_row_bytes); + upper_read_time += std::chrono::steady_clock::now() - batch_timer; + upper_bytes_read += current_upper_bytes; + batch_timer = std::chrono::steady_clock::now(); + bool invalid_node_id = false; + bool invalid_node_level = false; +#pragma omp parallel for num_threads(num_threads) \ + reduction(|| : invalid_node_id, invalid_node_level) + for (int64_t batch_idx = 0; batch_idx < static_cast(current_batch_size); + ++batch_idx) { + const auto node_id = static_cast(node_buffer[batch_idx]); + if (node_id >= metadata.n_rows) { + invalid_node_id = true; + continue; + } + if (layer.level > static_cast(levels_u8[node_id])) { + invalid_node_level = true; + continue; + } + auto ll = appr_algo->get_linklist(node_id, layer.level); + memcpy(ll, + link_buffer.data() + batch_idx * metadata.upper_link_row_bytes, + metadata.upper_link_row_bytes); + } + RAFT_EXPECTS(!invalid_node_id, "Invalid upper-layer node id in layered HNSW artifact"); + RAFT_EXPECTS(!invalid_node_level, + "Layered HNSW artifact references a node at an invalid upper level"); + upper_copy_time += std::chrono::steady_clock::now() - batch_timer; + log_deserialize_progress(current_upper_bytes); + } + } + const auto upper_elapsed_ms = elapsed_ms_since(upper_start_time); + RAFT_LOG_INFO( + "Layered HNSW load: upper layers loaded in %ld ms " + "(read %.2f ms %.2f GiB %.2f GiB/s, validate+copy %.2f ms)", + upper_elapsed_ms, + elapsed_ms(upper_read_time), + to_gib(upper_bytes_read), + throughput_gib_per_s(upper_bytes_read, upper_read_time), + elapsed_ms(upper_copy_time)); + + hnsw_index->set_index(std::move(appr_algo)); + RAFT_LOG_INFO("Layered HNSW load: total deserialize completed in %ld ms", + elapsed_ms_since(total_start_time)); + return hnsw_index; +} + +// Disk-to-disk materialization: layered HNSW artifact -> standard hnswlib index file. +// Constants/offsets shared by the materialization helpers. +struct hnswlib_materialize_layout { + size_t n_rows = 0; + size_t dim = 0; + size_t base_link_row_bytes = 0; + size_t upper_link_row_bytes = 0; + size_t size_data_per_element = 0; + size_t offset_data = 0; + size_t label_offset = 0; + size_t data_size = 0; + // Output file offsets. + size_t base_region_offset = 0; + size_t upper_region_offset = 0; + // Artifact payload offsets. + size_t base_nodes_offset = 0; + size_t base_links_offset = 0; + size_t upper_nodes_offset = 0; + size_t upper_links_offset = 0; +}; + +// Routes fixed-size, ID-keyed records into per-bucket temporary files and replays them per bucket. +// Used to reorder the (small) base/upper topology under a bounded host-memory budget while keeping +// all disk access sequential. +struct id_record_spiller { + std::filesystem::path dir; + size_t rows_per_bucket; + size_t num_buckets; + size_t record_bytes; + size_t buffer_cap; + std::vector fds; + std::vector offsets; + std::vector> buffers; + // One mutex per bucket so distinct buckets can be appended concurrently (add() is called from + // parallel scatter threads); same-bucket appends serialize on their own mutex. + std::vector bucket_mutexes; + + id_record_spiller(std::filesystem::path dir_, + size_t num_buckets_, + size_t rows_per_bucket_, + size_t record_bytes_, + size_t total_buffer_budget) + : dir(std::move(dir_)), + rows_per_bucket(rows_per_bucket_), + num_buckets(num_buckets_), + record_bytes(record_bytes_), + offsets(num_buckets_, 0), + buffers(num_buckets_), + bucket_mutexes(num_buckets_) + { + std::filesystem::create_directories(dir); + fds.reserve(num_buckets); + for (size_t b = 0; b < num_buckets; ++b) { + fds.emplace_back((dir / ("bucket_" + std::to_string(b) + ".tmp")).string(), + O_CREAT | O_RDWR | O_TRUNC, + 0644); + } + // Floor the per-bucket buffer so a tiny budget (many buckets) still batches many records per + // flush instead of degrading into one write syscall per record. This may exceed the nominal + // budget for pathologically small budgets, but keeps disk I/O coarse-grained and sequential. + constexpr size_t kMinRecordsPerFlush = 256; + const size_t even_share = total_buffer_budget / std::max(1, num_buckets); + buffer_cap = std::max(record_bytes * kMinRecordsPerFlush, even_share); + } + + id_record_spiller(const id_record_spiller&) = delete; + id_record_spiller& operator=(const id_record_spiller&) = delete; + id_record_spiller(id_record_spiller&&) = delete; + id_record_spiller& operator=(id_record_spiller&&) = delete; + + ~id_record_spiller() noexcept + { + fds.clear(); + std::error_code ec; + std::filesystem::remove_all(dir, ec); + } + + void flush(size_t b) + { + if (buffers[b].empty()) { return; } + cuvs::util::write_large_file(fds[b], buffers[b].data(), buffers[b].size(), offsets[b]); + offsets[b] += buffers[b].size(); + buffers[b].clear(); + } + + void add(size_t bucket, const void* record) + { + std::lock_guard guard(bucket_mutexes[bucket]); + auto& buf = buffers[bucket]; + if (buf.size() + record_bytes > buffer_cap) { flush(bucket); } + const auto* p = reinterpret_cast(record); + buf.insert(buf.end(), p, p + record_bytes); + } + + void finish_writes() + { + for (size_t b = 0; b < num_buckets; ++b) { + flush(b); + } + } + + template + void replay(size_t b, std::vector& chunk, F&& consume) + { + const size_t total = offsets[b]; + const size_t recs_per_chunk = std::max(1, (size_t{64} << 20) / record_bytes); + chunk.resize(recs_per_chunk * record_bytes); + size_t read_off = 0; + while (read_off < total) { + const size_t bytes = std::min(chunk.size(), total - read_off); + cuvs::util::read_large_file(fds[b], chunk.data(), bytes, read_off); + const size_t nrec = bytes / record_bytes; + for (size_t r = 0; r < nrec; ++r) { + consume(chunk.data() + r * record_bytes); + } + read_off += bytes; + } + } +}; + +// Streams the base topology (base_nodes + base_links) in artifact (ACE) order, invoking +// `cb(original_id, link_row_ptr)` for each row. When `parallel_threads > 1` the callback is invoked +// concurrently and must be thread-safe for distinct IDs. +template +void scatter_layered_base_links(const cuvs::util::file_descriptor& artifact_fd, + const hnswlib_materialize_layout& layout, + int parallel_threads, + F&& cb) +{ + const size_t n_rows = layout.n_rows; + const size_t row = layout.base_link_row_bytes; + const size_t batch = std::max(1, (size_t{64} << 20) / (sizeof(IdxT) + row)); + std::vector node_buf(batch); + std::vector link_buf(batch * row); + for (size_t s = 0; s < n_rows; s += batch) { + const size_t cur = std::min(batch, n_rows - s); + cuvs::util::read_large_file(artifact_fd, + node_buf.data(), + cur * sizeof(IdxT), + layout.base_nodes_offset + s * sizeof(IdxT)); + cuvs::util::read_large_file( + artifact_fd, link_buf.data(), cur * row, layout.base_links_offset + s * row); + bool invalid = false; + if (parallel_threads > 1) { +#pragma omp parallel for num_threads(parallel_threads) reduction(|| : invalid) + for (int64_t k = 0; k < static_cast(cur); ++k) { + const size_t id = static_cast(node_buf[k]); + if (id >= n_rows) { + invalid = true; + continue; + } + cb(id, link_buf.data() + static_cast(k) * row); + } + } else { + for (size_t k = 0; k < cur; ++k) { + const size_t id = static_cast(node_buf[k]); + if (id >= n_rows) { + invalid = true; + break; + } + cb(id, link_buf.data() + k * row); + } + } + RAFT_EXPECTS(!invalid, "Invalid base-layer node id in layered HNSW artifact"); + } +} + +// Streams the upper layers, invoking `cb(original_id, level, link_row_ptr)` for every promoted row. +template +void scatter_layered_upper_links(const cuvs::util::file_descriptor& artifact_fd, + const hnswlib_materialize_layout& layout, + const std::vector& layers, + const std::vector& levels_u8, + F&& cb) +{ + const size_t urow = layout.upper_link_row_bytes; + const size_t n_rows = layout.n_rows; + for (const auto& layer : layers) { + const size_t rc = layer.row_count; + if (rc == 0) { continue; } + const size_t batch = std::max(1, (size_t{64} << 20) / (sizeof(uint32_t) + urow)); + std::vector nodes(std::min(batch, rc)); + std::vector links(std::min(batch, rc) * urow); + for (size_t s = 0; s < rc; s += batch) { + const size_t cur = std::min(batch, rc - s); + cuvs::util::read_large_file( + artifact_fd, + nodes.data(), + cur * sizeof(uint32_t), + layout.upper_nodes_offset + (layer.node_offset + s) * sizeof(uint32_t)); + cuvs::util::read_large_file(artifact_fd, + links.data(), + cur * urow, + layout.upper_links_offset + (layer.link_offset + s) * urow); + for (size_t r = 0; r < cur; ++r) { + const size_t id = static_cast(nodes[r]); + RAFT_EXPECTS(id < n_rows, "Invalid upper-layer node id in layered HNSW artifact"); + RAFT_EXPECTS(layer.level <= static_cast(levels_u8[id]), + "Layered HNSW artifact references a node at an invalid upper level"); + cb(id, layer.level, links.data() + r * urow); + } + } + } +} + +// Emits the level-0 region `[link block | vector | label]` per ID, in increasing ID order, reading +// the dataset sequentially and writing the output sequentially. `row_ptr(id)` returns a pointer to +// the ID's level-0 link block in a caller-owned, bounded buffer. +template +void emit_hnswlib_base_records(const npy_file& dataset_file, + const cuvs::util::file_descriptor& output_fd, + const hnswlib_materialize_layout& layout, + int num_threads, + size_t id_begin, + size_t id_end, + std::vector& out_buffer, + DatasetBatch& dataset_batch, + RowPtr&& row_ptr) +{ + const size_t spe = layout.size_data_per_element; + const size_t row = layout.base_link_row_bytes; + const size_t data_size = layout.data_size; + const size_t out_batch_rows = static_cast(dataset_batch.extent(0)); + for (size_t s = id_begin; s < id_end; s += out_batch_rows) { + const size_t cur = std::min(out_batch_rows, id_end - s); + cuvs::util::read_large_file(dataset_file.fd, + dataset_batch.data_handle(), + cur * data_size, + dataset_file.header_size + s * data_size); +#pragma omp parallel for num_threads(num_threads) + for (int64_t k = 0; k < static_cast(cur); ++k) { + const size_t id = s + static_cast(k); + char* rec = out_buffer.data() + static_cast(k) * spe; + std::memcpy(rec, row_ptr(id), row); + std::memcpy(rec + layout.offset_data, + dataset_batch.data_handle() + static_cast(k) * layout.dim, + data_size); + const hnswlib::labeltype label = static_cast(id); + std::memcpy(rec + layout.label_offset, &label, sizeof(label)); + } + cuvs::util::write_large_file( + output_fd, out_buffer.data(), cur * spe, layout.base_region_offset + s * spe); + } +} + +// Phase 1 + 2: reorder the base topology to original-ID order and emit the level-0 region. +template +void materialize_hnswlib_base_region(const cuvs::util::file_descriptor& artifact_fd, + const npy_file& dataset_file, + const cuvs::util::file_descriptor& output_fd, + const layered_hnsw_file_metadata& metadata, + const hnswlib_materialize_layout& layout, + size_t budget_bytes, + int num_threads, + const std::filesystem::path& tmp_dir) +{ + const size_t n_rows = layout.n_rows; + const size_t row = layout.base_link_row_bytes; + + const size_t out_batch_rows = + std::max(1, (size_t{64} << 20) / layout.size_data_per_element); + auto dataset_batch = raft::make_host_matrix(static_cast(out_batch_rows), + static_cast(metadata.dim)); + std::vector out_buffer(out_batch_rows * layout.size_data_per_element); + + // Tracks that every original ID in [0, n_rows) is produced exactly once by the reorder. The + // section-size invariant (validated in Phase 0) guarantees exactly n_rows base-node records, so + // a malformed artifact with a missing/duplicate ID would otherwise leave a zero-initialized link + // row in the output and silently corrupt the graph; catch it instead of shipping a bad index. + std::vector seen(n_rows, 0); + auto verify_full_coverage = [&]() { + const auto covered = static_cast(std::count(seen.begin(), seen.end(), uint8_t{1})); + RAFT_EXPECTS(covered == n_rows, + "Layered HNSW base nodes cover %zu of %zu original ids (missing or duplicate ids)", + covered, + n_rows); + }; + + const auto base_start_time = std::chrono::steady_clock::now(); + // Single in-memory pass when the whole base topology fits the budget (no temporary files). + if (budget_bytes >= metadata.base_links_bytes) { + std::vector ordered_base(metadata.base_links_bytes); + scatter_layered_base_links( + artifact_fd, layout, num_threads, [&](size_t id, const char* link_row) { + std::memcpy(ordered_base.data() + id * row, link_row, row); + seen[id] = 1; + }); + verify_full_coverage(); + emit_hnswlib_base_records(dataset_file, + output_fd, + layout, + num_threads, + 0, + n_rows, + out_buffer, + dataset_batch, + [&](size_t id) { return ordered_base.data() + id * row; }); + RAFT_LOG_INFO("hnswlib materialize: base region written (single-pass) in %ld ms (%.2f GiB)", + elapsed_ms_since(base_start_time), + to_gib(layout.size_data_per_element * n_rows)); + return; + } + + // Bucketed reorder through temporary files for a hard memory budget. + const size_t record_bytes = sizeof(IdxT) + row; + size_t rows_per_bucket = std::max(1, budget_bytes / record_bytes); + rows_per_bucket = std::min(rows_per_bucket, n_rows); + const size_t num_buckets = (n_rows + rows_per_bucket - 1) / rows_per_bucket; + RAFT_LOG_INFO( + "hnswlib materialize: base region uses %zu buckets (rows/bucket=%zu, budget=%.2f GiB)", + num_buckets, + rows_per_bucket, + to_gib(budget_bytes)); + + id_record_spiller spiller( + tmp_dir / "base", num_buckets, rows_per_bucket, record_bytes, budget_bytes / 2); + // The spiller's per-bucket mutexes make add() safe under the parallel scatter; each thread stages + // the record in its own (thread_local) buffer before the routed append. + scatter_layered_base_links( + artifact_fd, layout, num_threads, [&](size_t id, const char* link_row) { + thread_local std::vector rec; + rec.resize(record_bytes); + const IdxT id32 = static_cast(id); + std::memcpy(rec.data(), &id32, sizeof(IdxT)); + std::memcpy(rec.data() + sizeof(IdxT), link_row, row); + spiller.add(id / rows_per_bucket, rec.data()); + seen[id] = 1; + }); + spiller.finish_writes(); + verify_full_coverage(); + + std::vector bucket_rows; + std::vector chunk; + for (size_t b = 0; b < num_buckets; ++b) { + const size_t id_begin = b * rows_per_bucket; + const size_t id_end = std::min(n_rows, id_begin + rows_per_bucket); + const size_t rib = id_end - id_begin; + bucket_rows.assign(rib * row, 0); + spiller.replay(b, chunk, [&](const char* r) { + IdxT id32; + std::memcpy(&id32, r, sizeof(IdxT)); + const size_t id = static_cast(id32); + std::memcpy(bucket_rows.data() + (id - id_begin) * row, r + sizeof(IdxT), row); + }); + emit_hnswlib_base_records( + dataset_file, + output_fd, + layout, + num_threads, + id_begin, + id_end, + out_buffer, + dataset_batch, + [&](size_t id) { return bucket_rows.data() + (id - id_begin) * row; }); + } + RAFT_LOG_INFO("hnswlib materialize: base region written (%zu buckets) in %ld ms (%.2f GiB)", + num_buckets, + elapsed_ms_since(base_start_time), + to_gib(layout.size_data_per_element * n_rows)); +} + +// Phase 3: transpose the upper layers into per-element link lists and emit the upper region. +inline void materialize_hnswlib_upper_region(const cuvs::util::file_descriptor& artifact_fd, + const cuvs::util::file_descriptor& output_fd, + const layered_hnsw_file_metadata& metadata, + const hnswlib_materialize_layout& layout, + const std::vector& levels_u8, + size_t budget_bytes, + const std::filesystem::path& tmp_dir) +{ + const size_t n_rows = layout.n_rows; + const size_t urow = layout.upper_link_row_bytes; + const auto upper_start_time = std::chrono::steady_clock::now(); + + // Sequential writer over the (variable-length) upper region. + size_t write_off = layout.upper_region_offset; + std::vector out_buffer; + out_buffer.reserve((size_t{64} << 20) + urow + sizeof(int)); + auto flush_out = [&]() { + if (out_buffer.empty()) { return; } + cuvs::util::write_large_file(output_fd, out_buffer.data(), out_buffer.size(), write_off); + write_off += out_buffer.size(); + out_buffer.clear(); + }; + auto append_element = [&](size_t level, const char* rows) { + const int link_list_size = level > 0 ? static_cast(level * urow) : 0; + const auto* p = reinterpret_cast(&link_list_size); + out_buffer.insert(out_buffer.end(), p, p + sizeof(int)); + if (level > 0) { out_buffer.insert(out_buffer.end(), rows, rows + level * urow); } + if (out_buffer.size() >= (size_t{64} << 20)) { flush_out(); } + }; + + const size_t fits_budget = metadata.upper_links_bytes + (n_rows + 1) * sizeof(size_t); + if (metadata.upper_links_bytes == 0 || fits_budget <= budget_bytes) { + // Pack all upper rows in original-ID order, then stream them out. + std::vector packed_start(n_rows + 1, 0); + for (size_t i = 0; i < n_rows; ++i) { + packed_start[i + 1] = packed_start[i] + static_cast(levels_u8[i]); + } + RAFT_EXPECTS(packed_start[n_rows] == metadata.upper_nodes_count, + "Layered HNSW upper rows (%zu) do not match upper_nodes_count (%zu)", + packed_start[n_rows], + metadata.upper_nodes_count); + std::vector packed(metadata.upper_links_bytes); + scatter_layered_upper_links(artifact_fd, + layout, + metadata.layers, + levels_u8, + [&](size_t id, size_t level, const char* link_row) { + const size_t dst_row = packed_start[id] + (level - 1); + std::memcpy(packed.data() + dst_row * urow, link_row, urow); + }); + size_t cursor = 0; + for (size_t id = 0; id < n_rows; ++id) { + const size_t level = static_cast(levels_u8[id]); + append_element(level, level > 0 ? packed.data() + cursor * urow : nullptr); + cursor += level; + } + flush_out(); + RAFT_LOG_INFO("hnswlib materialize: upper region written (single-pass) in %ld ms (%.2f GiB)", + elapsed_ms_since(upper_start_time), + to_gib(metadata.upper_links_bytes)); + return; + } + + // Bucketed upper transpose for a hard memory budget. + const size_t budget_half = std::max(1, budget_bytes / 2); + size_t num_buckets = + std::max(1, (metadata.upper_links_bytes + budget_half - 1) / budget_half); + size_t rows_per_bucket = std::max(1, (n_rows + num_buckets - 1) / num_buckets); + num_buckets = (n_rows + rows_per_bucket - 1) / rows_per_bucket; + RAFT_LOG_INFO("hnswlib materialize: upper region uses %zu buckets (rows/bucket=%zu)", + num_buckets, + rows_per_bucket); + + const size_t record_bytes = 2 * sizeof(uint32_t) + urow; // [id][level][row] + id_record_spiller spiller( + tmp_dir / "upper", num_buckets, rows_per_bucket, record_bytes, budget_half); + std::vector rec(record_bytes); + scatter_layered_upper_links(artifact_fd, + layout, + metadata.layers, + levels_u8, + [&](size_t id, size_t level, const char* link_row) { + const uint32_t id32 = static_cast(id); + const uint32_t lv32 = static_cast(level); + std::memcpy(rec.data(), &id32, sizeof(uint32_t)); + std::memcpy(rec.data() + sizeof(uint32_t), &lv32, sizeof(uint32_t)); + std::memcpy(rec.data() + 2 * sizeof(uint32_t), link_row, urow); + spiller.add(id / rows_per_bucket, rec.data()); + }); + spiller.finish_writes(); + + std::vector chunk; + for (size_t b = 0; b < num_buckets; ++b) { + const size_t id_begin = b * rows_per_bucket; + const size_t id_end = std::min(n_rows, id_begin + rows_per_bucket); + const size_t rib = id_end - id_begin; + std::vector local_start(rib + 1, 0); + for (size_t i = 0; i < rib; ++i) { + local_start[i + 1] = local_start[i] + static_cast(levels_u8[id_begin + i]); + } + std::vector packed(local_start[rib] * urow); + spiller.replay(b, chunk, [&](const char* r) { + uint32_t id32 = 0; + uint32_t lv32 = 0; + std::memcpy(&id32, r, sizeof(uint32_t)); + std::memcpy(&lv32, r + sizeof(uint32_t), sizeof(uint32_t)); + const size_t local = static_cast(id32) - id_begin; + const size_t dst_row = local_start[local] + (static_cast(lv32) - 1); + std::memcpy(packed.data() + dst_row * urow, r + 2 * sizeof(uint32_t), urow); + }); + size_t cursor = 0; + for (size_t i = 0; i < rib; ++i) { + const size_t level = static_cast(levels_u8[id_begin + i]); + append_element(level, level > 0 ? packed.data() + cursor * urow : nullptr); + cursor += level; + } + } + flush_out(); + RAFT_LOG_INFO("hnswlib materialize: upper region written (%zu buckets) in %ld ms (%.2f GiB)", + num_buckets, + elapsed_ms_since(upper_start_time), + to_gib(metadata.upper_links_bytes)); +} + +// Materialize a layered HNSW artifact + dataset into a standard hnswlib index file on disk, using +// bounded host memory and sequential disk I/O. +template +void materialize_layered_to_hnswlib_on_disk(raft::resources const& res, + const cuvs::util::file_descriptor& artifact_fd, + const cuvs::neighbors::hnsw::materialize_params& params, + const std::string& layered_artifact_path, + const std::string& output_path, + int dim, + cuvs::distance::DistanceType metric) +{ + static_assert(std::is_same_v, "Layered HNSW artifacts store ids as uint32_t"); + common::nvtx::range fun_scope("hnsw::materialize_to_hnswlib"); + const auto total_start_time = std::chrono::steady_clock::now(); + + // ---- Phase 0: read and validate the artifact header, descriptors and levels ---- + // Shared with the in-memory deserialize path so the format/offset arithmetic and every + // size-invariant check live in one place; this also bounds the buffer allocations below. + auto view = + read_and_validate_layered_artifact(artifact_fd, layered_artifact_path, dim, metric); + auto& metadata = view.metadata; + auto& levels_u8 = view.levels; + const auto base_nodes_offset = view.base_nodes_offset; + const auto base_links_offset = view.base_links_offset; + const auto upper_nodes_offset = view.upper_nodes_offset; + const auto upper_links_offset = view.upper_links_offset; + RAFT_EXPECTS(!params.dataset_path.empty(), + "Layered HNSW materialization requires materialize_params.dataset_path"); + + const size_t n_rows = metadata.n_rows; + + // Validate the dataset header. + auto dataset_file = open_layered_dataset_file(params.dataset_path); + RAFT_EXPECTS(dataset_file.shape.size() == 2 && dataset_file.shape[0] == n_rows && + dataset_file.shape[1] == metadata.dim, + "Layered HNSW dataset shape mismatch: artifact rows=%zu dim=%zu, dataset path=%s", + n_rows, + metadata.dim, + params.dataset_path.c_str()); + + // Retrieve the hnswlib layout constants from a dummy (single-element) index. + auto hnsw_index = std::make_unique>(dim, metric, HnswHierarchy::CPU); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), 1, metadata.M, metadata.ef_construction); + const size_t data_size = static_cast(dim) * sizeof(T); + RAFT_EXPECTS(appr_algo->size_links_level0_ == metadata.base_link_row_bytes, + "Layered HNSW base link row size mismatch"); + RAFT_EXPECTS(appr_algo->size_links_per_element_ == metadata.upper_link_row_bytes, + "Layered HNSW upper link row size mismatch"); + RAFT_EXPECTS(appr_algo->maxM0_ == metadata.maxM0 && appr_algo->maxM_ == metadata.maxM, + "Layered HNSW M parameter mismatch"); + RAFT_EXPECTS(appr_algo->data_size_ == data_size, "Layered HNSW data size mismatch"); + RAFT_EXPECTS(appr_algo->offsetData_ == appr_algo->size_links_level0_, + "Unexpected hnswlib data offset"); + RAFT_EXPECTS(appr_algo->label_offset_ == appr_algo->size_links_level0_ + data_size, + "Unexpected hnswlib label offset"); + + // ---- Output layout ---- + const auto native_header = make_hnswlib_native_header(appr_algo->offsetLevel0_, + n_rows, + appr_algo->size_data_per_element_, + appr_algo->label_offset_, + appr_algo->offsetData_, + metadata.maxlevel, + metadata.enterpoint_node, + metadata.maxM, + metadata.maxM0, + metadata.M, + metadata.mult, + metadata.ef_construction); + hnswlib_materialize_layout layout; + layout.n_rows = n_rows; + layout.dim = metadata.dim; + layout.base_link_row_bytes = metadata.base_link_row_bytes; + layout.upper_link_row_bytes = metadata.upper_link_row_bytes; + layout.size_data_per_element = appr_algo->size_data_per_element_; + layout.offset_data = appr_algo->offsetData_; + layout.label_offset = appr_algo->label_offset_; + layout.data_size = data_size; + layout.base_region_offset = native_header.size(); + layout.base_nodes_offset = base_nodes_offset; + layout.base_links_offset = base_links_offset; + layout.upper_nodes_offset = upper_nodes_offset; + layout.upper_links_offset = upper_links_offset; + + const size_t base_region_bytes = n_rows * layout.size_data_per_element; + layout.upper_region_offset = layout.base_region_offset + base_region_bytes; + + size_t upper_region_bytes = 0; + for (size_t i = 0; i < n_rows; ++i) { + upper_region_bytes += sizeof(int); + const size_t level = static_cast(levels_u8[i]); + if (level > 0) { upper_region_bytes += level * layout.upper_link_row_bytes; } + } + const size_t final_size = layout.upper_region_offset + upper_region_bytes; + + cuvs::util::file_descriptor output_fd(output_path, O_CREAT | O_RDWR | O_TRUNC, 0644); + cuvs::util::preallocate_file(output_fd, final_size); + cuvs::util::write_large_file(output_fd, native_header.data(), native_header.size(), 0); + RAFT_LOG_INFO( + "hnswlib materialize: writing index (rows=%zu dim=%zu, output=%.2f GiB, dataset=%s)", + n_rows, + metadata.dim, + to_gib(final_size), + params.dataset_path.c_str()); + + auto num_threads = + params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads; + // max_host_memory_gb <= 0 => no host-memory cap: both regions take their single in-memory pass + // and create no temporary files. Otherwise reserve headroom for the fixed-size streaming I/O + // buffers (dataset batch, output buffer, per-call scatter read buffers, each ~64 MiB) so peak + // host memory stays close to the requested budget rather than overshooting it by the buffer + // overhead. + size_t budget_bytes = std::numeric_limits::max(); + if (params.max_host_memory_gb > 0.0) { + const size_t requested = + std::max(1, static_cast(params.max_host_memory_gb * (size_t{1} << 30))); + constexpr size_t kStreamingBufferReserve = size_t{4} * (size_t{64} << 20); // ~256 MiB + budget_bytes = requested > kStreamingBufferReserve ? requested - kStreamingBufferReserve + : std::max(1, requested / 2); + } + + std::filesystem::path tmp_dir = output_path; + tmp_dir += ".materialize_tmp"; + + materialize_hnswlib_base_region( + artifact_fd, dataset_file, output_fd, metadata, layout, budget_bytes, num_threads, tmp_dir); + materialize_hnswlib_upper_region( + artifact_fd, output_fd, metadata, layout, levels_u8, budget_bytes, tmp_dir); + + std::error_code ec; + std::filesystem::remove_all(tmp_dir, ec); + + RAFT_LOG_INFO("hnswlib materialize: completed in %ld ms (output %.2f GiB)", + elapsed_ms_since(total_start_time), + to_gib(final_size)); +} + +// Reads the dtype tag from the artifact header and dispatches to the typed implementation. +inline void materialize_layered_to_hnswlib_on_disk_dispatch( + raft::resources const& res, + const cuvs::neighbors::hnsw::materialize_params& params, + const std::string& layered_artifact_path, + const std::string& output_path, + int dim, + cuvs::distance::DistanceType metric) +{ + // Open the artifact once and read just the dtype tag to pick T; the typed worker reuses this fd + // (no second open) and performs full header validation via read_and_validate_layered_artifact. + cuvs::util::file_descriptor artifact_fd(layered_artifact_path, O_RDONLY); + layered_hnsw_file_header header{}; + cuvs::util::read_large_file(artifact_fd, &header, sizeof(header), 0); + RAFT_EXPECTS(std::strncmp(header.magic, layered_hnsw_magic, sizeof(header.magic)) == 0, + "Invalid layered HNSW artifact magic: %s", + layered_artifact_path.c_str()); + switch (static_cast(header.dtype)) { + case layered_hnsw_dtype::float32: + materialize_layered_to_hnswlib_on_disk( + res, artifact_fd, params, layered_artifact_path, output_path, dim, metric); + break; + case layered_hnsw_dtype::float16: + materialize_layered_to_hnswlib_on_disk( + res, artifact_fd, params, layered_artifact_path, output_path, dim, metric); + break; + case layered_hnsw_dtype::uint8: + materialize_layered_to_hnswlib_on_disk( + res, artifact_fd, params, layered_artifact_path, output_path, dim, metric); + break; + case layered_hnsw_dtype::int8: + materialize_layered_to_hnswlib_on_disk( + res, artifact_fd, params, layered_artifact_path, output_path, dim, metric); + break; + default: RAFT_FAIL("Unsupported layered HNSW artifact dtype %u", header.dtype); + } +} + template void deserialize(raft::resources const& res, const index_params& params, @@ -1249,6 +3160,12 @@ void deserialize(raft::resources const& res, cuvs::distance::DistanceType metric, index** idx) { + if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) { + auto hnsw_index = deserialize_layered_hnsw(res, params, filename, dim, metric); + *idx = hnsw_index.release(); + return; + } + try { auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); auto appr_algo = std::make_unique::type>>( @@ -1286,6 +3203,13 @@ std::unique_ptr> build(raft::resources const& res, ? std::get(params.graph_build_params) : graph_build_params::ace_params{}; + if (params.hierarchy == HnswHierarchy::GPU_LAYERED_ON_DISK) { + RAFT_EXPECTS(ace_params.use_disk, + "GPU_LAYERED_ON_DISK requires ACE disk mode (ace_params.use_disk = true)"); + RAFT_EXPECTS(!ace_params.build_dir.empty(), + "GPU_LAYERED_ON_DISK requires ace_params.build_dir to be set"); + } + // Create CAGRA index parameters from HNSW parameters cuvs::neighbors::cagra::index_params cagra_params; cagra_params.metric = params.metric; @@ -1302,15 +3226,14 @@ std::unique_ptr> build(raft::resources const& res, cagra_ace_params.max_gpu_memory_gb = ace_params.max_gpu_memory_gb; cagra_params.graph_build_params = cagra_ace_params; - RAFT_LOG_INFO( - "hnsw::build - Building HNSW index using ACE with %zu partitions, ef_construction=%zu", - ace_params.npartitions, - ace_params.ef_construction); + RAFT_LOG_INFO("HNSW ACE build: building CAGRA graph (partitions=%zu ef_construction=%zu)", + cagra_ace_params.npartitions, + cagra_ace_params.ef_construction); // Build CAGRA index using ACE auto cagra_index = cuvs::neighbors::cagra::build(res, cagra_params, dataset); - RAFT_LOG_INFO("hnsw::build - Converting CAGRA index to HNSW format"); + RAFT_LOG_INFO("HNSW ACE build: converting CAGRA graph to HNSW hierarchy"); // Convert CAGRA index to HNSW index return from_cagra(res, params, cagra_index, dataset); diff --git a/cpp/src/neighbors/hnsw.cpp b/cpp/src/neighbors/hnsw.cpp index 54e9dcf12a..a3d6ca3ab6 100644 --- a/cpp/src/neighbors/hnsw.cpp +++ b/cpp/src/neighbors/hnsw.cpp @@ -115,4 +115,17 @@ CUVS_INST_HNSW_SERIALIZE(int8_t); #undef CUVS_INST_HNSW_SERIALIZE +// The element data type is read from the artifact header; the dispatcher selects the typed +// implementation, instantiating the float/half/uint8_t/int8_t materialize paths in this TU. +void materialize_to_hnswlib(raft::resources const& res, + const materialize_params& params, + const std::string& layered_artifact_path, + const std::string& output_path, + int dim, + cuvs::distance::DistanceType metric) +{ + detail::materialize_layered_to_hnswlib_on_disk_dispatch( + res, params, layered_artifact_path, output_path, dim, metric); +} + } // namespace cuvs::neighbors::hnsw diff --git a/cpp/src/util/file_io.cpp b/cpp/src/util/file_io.cpp index d924527e72..dac930cad2 100644 --- a/cpp/src/util/file_io.cpp +++ b/cpp/src/util/file_io.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -10,10 +10,29 @@ #include #include +#include #include namespace cuvs::util { +void preallocate_file(const file_descriptor& fd, const size_t total_bytes) +{ + if (total_bytes == 0) { return; } + RAFT_EXPECTS(fd.is_valid(), "File descriptor must be valid"); + const int rc = posix_fallocate(fd.get(), 0, static_cast(total_bytes)); + if (rc == 0) { return; } + // Some filesystems (tmpfs, certain NFS/overlay mounts) do not support preallocation; fall back + // to ftruncate so a valid output location is still usable. + if (rc == EOPNOTSUPP || rc == EINVAL || rc == ENOSYS) { + RAFT_EXPECTS(ftruncate(fd.get(), static_cast(total_bytes)) == 0, + "Failed to pre-size file %s via ftruncate: %s", + fd.get_path().c_str(), + strerror(errno)); + return; + } + RAFT_FAIL("Failed to pre-allocate file %s: %s", fd.get_path().c_str(), strerror(rc)); +} + void read_large_file(const file_descriptor& fd, void* dest_ptr, const size_t total_bytes, diff --git a/cpp/tests/neighbors/ann_hnsw_ace.cuh b/cpp/tests/neighbors/ann_hnsw_ace.cuh index 78ae54d9fc..f0eeaa51ad 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace.cuh +++ b/cpp/tests/neighbors/ann_hnsw_ace.cuh @@ -7,9 +7,18 @@ #include "ann_cagra.cuh" #include +#include +#include +#include #include +#include +#include #include +#include +#include +#include +#include namespace cuvs::neighbors::hnsw { @@ -259,6 +268,438 @@ class AnnHnswAceTest : public ::testing::TestWithParam { std::filesystem::remove_all(temp_dir); } + void testHnswAceLayeredBuildDeserializeSearch() + { + size_t queries_size = ps.n_queries * ps.k; + std::vector indexes_naive(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indexes_naive_dev(queries_size, stream_); + + cuvs::neighbors::naive_knn(handle_, + distances_naive_dev.data(), + indexes_naive_dev.data(), + search_queries.data(), + database_dev.data(), + ps.n_queries, + ps.n_rows, + ps.dim, + ps.k, + ps.metric); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + raft::update_host(indexes_naive.data(), indexes_naive_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + std::string temp_dir = std::string("/tmp/cuvs_hnsw_ace_layered_test_") + + std::to_string(std::time(nullptr)) + "_" + + std::to_string(reinterpret_cast(this)); + std::filesystem::create_directories(temp_dir); + struct temp_dir_cleanup { + std::string path; + ~temp_dir_cleanup() + { + std::error_code ec; + std::filesystem::remove_all(path, ec); + } + } cleanup{temp_dir}; + + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database_dev.data(), ps.n_rows * ps.dim, stream_); + auto queries_host = raft::make_host_matrix(ps.n_queries, ps.dim); + raft::copy(queries_host.data_handle(), search_queries.data(), ps.n_queries * ps.dim, stream_); + raft::resource::sync_stream(handle_); + + const auto dataset_file = (std::filesystem::path(temp_dir) / "dataset.npy").string(); + auto [dataset_fd, dataset_header_size] = cuvs::util::create_numpy_file( + dataset_file, {static_cast(ps.n_rows), static_cast(ps.dim)}); + cuvs::util::write_large_file(dataset_fd, + database_host.data_handle(), + static_cast(ps.n_rows) * ps.dim * sizeof(DataT), + dataset_header_size); + + hnsw::index_params hnsw_params; + hnsw_params.metric = ps.metric; + hnsw_params.hierarchy = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK; + hnsw_params.M = 32; + hnsw_params.ef_construction = ps.ef_construction; + hnsw_params.dataset_path = dataset_file; + + auto ace_params = graph_build_params::ace_params(); + ace_params.npartitions = ps.npartitions; + ace_params.build_dir = temp_dir; + ace_params.use_disk = true; + ace_params.max_host_memory_gb = ps.max_host_memory_gb; + ace_params.max_gpu_memory_gb = ps.max_gpu_memory_gb; + hnsw_params.graph_build_params = ace_params; + + auto hnsw_index = + hnsw::build(handle_, hnsw_params, raft::make_const_mdspan(database_host.view())); + ASSERT_NE(hnsw_index, nullptr); + + const auto artifact_path = hnsw_index->file_path(); + ASSERT_FALSE(artifact_path.empty()); + ASSERT_TRUE(std::filesystem::is_regular_file(artifact_path)); + EXPECT_EQ(std::filesystem::path(artifact_path).filename().string(), "hnsw_index.cuvs"); + size_t cuvs_artifact_count = 0; + for (const auto& entry : std::filesystem::directory_iterator(temp_dir)) { + if (entry.path().extension() == ".cuvs") { ++cuvs_artifact_count; } + } + EXPECT_EQ(cuvs_artifact_count, 1); + EXPECT_FALSE(std::filesystem::exists(std::filesystem::path(temp_dir) / "layered_hnsw")); + + auto indexes_hnsw_host = raft::make_host_matrix(ps.n_queries, ps.k); + auto distances_hnsw_host = raft::make_host_matrix(ps.n_queries, ps.k); + + hnsw::search_params search_params; + search_params.ef = std::max(ps.ef_construction, ps.k * 2); + search_params.num_threads = 1; + + EXPECT_THROW(hnsw::search(handle_, + search_params, + *hnsw_index, + queries_host.view(), + indexes_hnsw_host.view(), + distances_hnsw_host.view()), + std::exception); + + hnsw::index* deserialized_index = nullptr; + hnsw::deserialize(handle_, hnsw_params, artifact_path, ps.dim, ps.metric, &deserialized_index); + ASSERT_NE(deserialized_index, nullptr); + std::unique_ptr> deserialized_guard(deserialized_index); + + hnsw::search(handle_, + search_params, + *deserialized_guard, + queries_host.view(), + indexes_hnsw_host.view(), + distances_hnsw_host.view()); + + std::vector indexes_hnsw_converted(queries_size); + std::vector distances_hnsw(queries_size); + for (size_t i = 0; i < queries_size; i++) { + indexes_hnsw_converted[i] = static_cast(indexes_hnsw_host.data_handle()[i]); + distances_hnsw[i] = distances_hnsw_host.data_handle()[i]; + } + + EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive, + indexes_hnsw_converted, + distances_naive, + distances_hnsw, + ps.n_queries, + ps.k, + 0.003, + ps.min_recall)) + << "Layered HNSW deserialize and search failed recall check"; + + const auto copied_artifact = + (std::filesystem::path(temp_dir) / "copied_layered" / "hnsw_index.cuvs").string(); + hnsw::serialize(handle_, copied_artifact, *hnsw_index); + EXPECT_TRUE(std::filesystem::is_regular_file(copied_artifact)); + size_t copied_file_count = 0; + for (const auto& entry : std::filesystem::directory_iterator( + std::filesystem::path(copied_artifact).parent_path())) { + if (entry.is_regular_file()) { ++copied_file_count; } + } + EXPECT_EQ(copied_file_count, 1); + + hnsw::index* copied_index = nullptr; + hnsw::deserialize(handle_, hnsw_params, copied_artifact, ps.dim, ps.metric, &copied_index); + ASSERT_NE(copied_index, nullptr); + std::unique_ptr> copied_guard(copied_index); + + hnsw::search(handle_, + search_params, + *copied_guard, + queries_host.view(), + indexes_hnsw_host.view(), + distances_hnsw_host.view()); + + for (size_t i = 0; i < queries_size; i++) { + indexes_hnsw_converted[i] = static_cast(indexes_hnsw_host.data_handle()[i]); + distances_hnsw[i] = distances_hnsw_host.data_handle()[i]; + } + EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive, + indexes_hnsw_converted, + distances_naive, + distances_hnsw, + ps.n_queries, + ps.k, + 0.003, + ps.min_recall)) + << "Copied layered HNSW artifact deserialize and search failed recall check"; + + const auto bad_artifact = + (std::filesystem::path(temp_dir) / "bad_layered" / "bad_magic.cuvs").string(); + std::filesystem::create_directories(std::filesystem::path(bad_artifact).parent_path()); + { + std::ofstream bad_file(bad_artifact, std::ios::binary); + std::array bad_bytes{}; + bad_file.write(bad_bytes.data(), bad_bytes.size()); + } + hnsw::index* bad_index = nullptr; + EXPECT_THROW( + hnsw::deserialize(handle_, hnsw_params, bad_artifact, ps.dim, ps.metric, &bad_index), + std::exception); + + const auto bad_version_artifact = + (std::filesystem::path(temp_dir) / "bad_layered" / "bad_version.cuvs").string(); + std::filesystem::copy_file( + copied_artifact, bad_version_artifact, std::filesystem::copy_options::overwrite_existing); + constexpr std::streamoff layered_header_version_offset = 32; + { + std::fstream bad_version_file(bad_version_artifact, + std::ios::in | std::ios::out | std::ios::binary); + const uint32_t bad_version = 999; + bad_version_file.seekp(layered_header_version_offset); + bad_version_file.write(reinterpret_cast(&bad_version), sizeof(bad_version)); + } + hnsw::index* bad_version_index = nullptr; + EXPECT_THROW( + hnsw::deserialize( + handle_, hnsw_params, bad_version_artifact, ps.dim, ps.metric, &bad_version_index), + std::exception); + + auto missing_dataset_params = hnsw_params; + missing_dataset_params.dataset_path.clear(); + hnsw::index* missing_dataset_index = nullptr; + EXPECT_THROW(hnsw::deserialize(handle_, + missing_dataset_params, + copied_artifact, + ps.dim, + ps.metric, + &missing_dataset_index), + std::exception); + + const auto truncated_artifact = + (std::filesystem::path(temp_dir) / "bad_layered" / "truncated.cuvs").string(); + std::filesystem::copy_file( + copied_artifact, truncated_artifact, std::filesystem::copy_options::overwrite_existing); + std::filesystem::resize_file(truncated_artifact, 128); + hnsw::index* truncated_index = nullptr; + EXPECT_THROW(hnsw::deserialize( + handle_, hnsw_params, truncated_artifact, ps.dim, ps.metric, &truncated_index), + std::exception); + + const auto wrong_dataset_file = + (std::filesystem::path(temp_dir) / "bad_layered" / "wrong_dataset.npy").string(); + auto [wrong_dataset_fd, wrong_dataset_header_size] = cuvs::util::create_numpy_file( + wrong_dataset_file, {static_cast(ps.n_rows - 1), static_cast(ps.dim)}); + cuvs::util::write_large_file(wrong_dataset_fd, + database_host.data_handle(), + static_cast(ps.n_rows - 1) * ps.dim * sizeof(DataT), + wrong_dataset_header_size); + auto wrong_dataset_params = hnsw_params; + wrong_dataset_params.dataset_path = wrong_dataset_file; + hnsw::index* wrong_dataset_index = nullptr; + EXPECT_THROW( + hnsw::deserialize( + handle_, wrong_dataset_params, copied_artifact, ps.dim, ps.metric, &wrong_dataset_index), + std::exception); + } + + void testHnswAceLayeredMaterializeToHnswlib() + { + size_t queries_size = ps.n_queries * ps.k; + std::vector indexes_naive(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indexes_naive_dev(queries_size, stream_); + cuvs::neighbors::naive_knn(handle_, + distances_naive_dev.data(), + indexes_naive_dev.data(), + search_queries.data(), + database_dev.data(), + ps.n_queries, + ps.n_rows, + ps.dim, + ps.k, + ps.metric); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + raft::update_host(indexes_naive.data(), indexes_naive_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + std::string temp_dir = std::string("/tmp/cuvs_hnsw_ace_materialize_test_") + + std::to_string(std::time(nullptr)) + "_" + + std::to_string(reinterpret_cast(this)); + std::filesystem::create_directories(temp_dir); + struct temp_dir_cleanup { + std::string path; + ~temp_dir_cleanup() + { + std::error_code ec; + std::filesystem::remove_all(path, ec); + } + } cleanup{temp_dir}; + + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database_dev.data(), ps.n_rows * ps.dim, stream_); + auto queries_host = raft::make_host_matrix(ps.n_queries, ps.dim); + raft::copy(queries_host.data_handle(), search_queries.data(), ps.n_queries * ps.dim, stream_); + raft::resource::sync_stream(handle_); + + const auto dataset_file = (std::filesystem::path(temp_dir) / "dataset.npy").string(); + auto [dataset_fd, dataset_header_size] = cuvs::util::create_numpy_file( + dataset_file, {static_cast(ps.n_rows), static_cast(ps.dim)}); + cuvs::util::write_large_file(dataset_fd, + database_host.data_handle(), + static_cast(ps.n_rows) * ps.dim * sizeof(DataT), + dataset_header_size); + + hnsw::index_params hnsw_params; + hnsw_params.metric = ps.metric; + hnsw_params.hierarchy = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK; + hnsw_params.M = 32; + hnsw_params.ef_construction = ps.ef_construction; + hnsw_params.dataset_path = dataset_file; + + auto ace_params = graph_build_params::ace_params(); + ace_params.npartitions = ps.npartitions; + ace_params.build_dir = temp_dir; + ace_params.use_disk = true; + ace_params.max_host_memory_gb = ps.max_host_memory_gb; + ace_params.max_gpu_memory_gb = ps.max_gpu_memory_gb; + hnsw_params.graph_build_params = ace_params; + + auto hnsw_index = + hnsw::build(handle_, hnsw_params, raft::make_const_mdspan(database_host.view())); + ASSERT_NE(hnsw_index, nullptr); + const auto artifact_path = hnsw_index->file_path(); + ASSERT_FALSE(artifact_path.empty()); + + hnsw::search_params search_params; + search_params.ef = std::max(ps.ef_construction, ps.k * 2); + search_params.num_threads = 1; + + auto indexes_hnsw_host = raft::make_host_matrix(ps.n_queries, ps.k); + auto distances_hnsw_host = raft::make_host_matrix(ps.n_queries, ps.k); + + // Reference: load the layered artifact in RAM and search. + std::vector indexes_layered(queries_size); + std::vector distances_layered(queries_size); + { + hnsw::index* layered_index = nullptr; + hnsw::deserialize(handle_, hnsw_params, artifact_path, ps.dim, ps.metric, &layered_index); + ASSERT_NE(layered_index, nullptr); + std::unique_ptr> layered_guard(layered_index); + hnsw::search(handle_, + search_params, + *layered_guard, + queries_host.view(), + indexes_hnsw_host.view(), + distances_hnsw_host.view()); + for (size_t i = 0; i < queries_size; i++) { + indexes_layered[i] = static_cast(indexes_hnsw_host.data_handle()[i]); + distances_layered[i] = distances_hnsw_host.data_handle()[i]; + } + } + + // Materialize to a standard hnswlib index file twice: single in-memory pass and bucketed. + const auto out_single = (std::filesystem::path(temp_dir) / "materialized_single.bin").string(); + const auto out_bucketed = + (std::filesystem::path(temp_dir) / "materialized_bucketed.bin").string(); + + hnsw::materialize_params materialize_single; + materialize_single.dataset_path = dataset_file; + materialize_single.max_host_memory_gb = 0; // single in-memory reorder pass + hnsw::materialize_to_hnswlib( + handle_, materialize_single, artifact_path, out_single, ps.dim, ps.metric); + + hnsw::materialize_params materialize_bucketed; + materialize_bucketed.dataset_path = dataset_file; + materialize_bucketed.max_host_memory_gb = 0.00003; // tiny budget forces bucketed base + upper + hnsw::materialize_to_hnswlib( + handle_, materialize_bucketed, artifact_path, out_bucketed, ps.dim, ps.metric); + + ASSERT_TRUE(std::filesystem::is_regular_file(out_single)); + ASSERT_TRUE(std::filesystem::is_regular_file(out_bucketed)); + + // Determinism: single-pass and bucketed outputs must be byte-identical. + { + const auto read_all = [](const std::string& path) { + std::ifstream stream(path, std::ios::binary); + return std::vector((std::istreambuf_iterator(stream)), + std::istreambuf_iterator()); + }; + const auto bytes_single = read_all(out_single); + const auto bytes_bucketed = read_all(out_bucketed); + ASSERT_FALSE(bytes_single.empty()); + EXPECT_EQ(bytes_single.size(), bytes_bucketed.size()); + EXPECT_TRUE(bytes_single == bytes_bucketed) + << "Single-pass and bucketed materialized outputs differ"; + } + + // Load the materialized file as a standard (CPU) hnswlib index and search. + hnsw::index_params cpu_params; + cpu_params.metric = ps.metric; + cpu_params.hierarchy = hnsw::HnswHierarchy::CPU; + hnsw::index* cpu_index = nullptr; + hnsw::deserialize(handle_, cpu_params, out_single, ps.dim, ps.metric, &cpu_index); + ASSERT_NE(cpu_index, nullptr); + std::unique_ptr> cpu_guard(cpu_index); + + hnsw::search(handle_, + search_params, + *cpu_guard, + queries_host.view(), + indexes_hnsw_host.view(), + distances_hnsw_host.view()); + + std::vector indexes_cpu(queries_size); + std::vector distances_cpu(queries_size); + for (size_t i = 0; i < queries_size; i++) { + indexes_cpu[i] = static_cast(indexes_hnsw_host.data_handle()[i]); + distances_cpu[i] = distances_hnsw_host.data_handle()[i]; + } + + EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_naive, + indexes_cpu, + distances_naive, + distances_cpu, + ps.n_queries, + ps.k, + 0.003, + ps.min_recall)) + << "Materialized hnswlib index failed recall check vs. ground truth"; + + // The materialized CPU index represents the same graph and vectors as the layered artifact, + // so its search results must match the in-memory layered path. + EXPECT_TRUE(cuvs::neighbors::eval_neighbours(indexes_layered, + indexes_cpu, + distances_layered, + distances_cpu, + ps.n_queries, + ps.k, + 0.003, + 0.99)) + << "Materialized hnswlib index disagrees with the in-memory layered path"; + + hnsw::materialize_params bad_params; + bad_params.dataset_path = dataset_file; + const auto err_out = (std::filesystem::path(temp_dir) / "err.bin").string(); + EXPECT_THROW(hnsw::materialize_to_hnswlib( + handle_, bad_params, artifact_path, err_out, ps.dim + 1, ps.metric), + std::exception); + + const auto wrong_metric = ps.metric == cuvs::distance::DistanceType::L2Expanded + ? cuvs::distance::DistanceType::InnerProduct + : cuvs::distance::DistanceType::L2Expanded; + EXPECT_THROW(hnsw::materialize_to_hnswlib( + handle_, bad_params, artifact_path, err_out, ps.dim, wrong_metric), + std::exception); + + hnsw::materialize_params missing_dataset; + missing_dataset.dataset_path.clear(); + EXPECT_THROW(hnsw::materialize_to_hnswlib( + handle_, missing_dataset, artifact_path, err_out, ps.dim, ps.metric), + std::exception); + } + void SetUp() override { database_dev.resize(((size_t)ps.n_rows) * ps.dim, stream_); @@ -323,8 +764,26 @@ inline std::vector generate_hnsw_ace_memory_fallback_inputs() }; } +inline std::vector generate_hnsw_ace_layered_inputs() +{ + return { + {10, // n_queries + 5000, // n_rows + 64, // dim + 10, // k + 2, // npartitions + 100, // ef_construction + true, // use_disk + cuvs::distance::DistanceType::L2Expanded, + 0.9, // min_recall + 0.0, // max_host_memory_gb + 0.0} // max_gpu_memory_gb + }; +} + const std::vector hnsw_ace_inputs = generate_hnsw_ace_inputs(); const std::vector hnsw_ace_memory_fallback_inputs = generate_hnsw_ace_memory_fallback_inputs(); +const std::vector hnsw_ace_layered_inputs = generate_hnsw_ace_layered_inputs(); } // namespace cuvs::neighbors::hnsw diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu index 57b16d74e9..33a272a675 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu +++ b/cpp/tests/neighbors/ann_hnsw_ace/test_float_uint32_t.cu @@ -23,4 +23,24 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest, AnnHnswAceMemoryFallbackTest_float, ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs)); +typedef AnnHnswAceTest AnnHnswAceLayeredTest_float; +TEST_P(AnnHnswAceLayeredTest_float, AnnHnswAceLayeredBuildDeserializeSearch) +{ + this->testHnswAceLayeredBuildDeserializeSearch(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest, + AnnHnswAceLayeredTest_float, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + +typedef AnnHnswAceTest AnnHnswAceMaterializeTest_float; +TEST_P(AnnHnswAceMaterializeTest_float, AnnHnswAceLayeredMaterializeToHnswlib) +{ + this->testHnswAceLayeredMaterializeToHnswlib(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMaterializeTest, + AnnHnswAceMaterializeTest_float, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + } // namespace cuvs::neighbors::hnsw diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu index 38f75b2afa..acde405a3b 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu +++ b/cpp/tests/neighbors/ann_hnsw_ace/test_half_uint32_t.cu @@ -12,4 +12,34 @@ TEST_P(AnnHnswAceTest_half, AnnHnswAceBuild) { this->testHnswAceBuild(); } INSTANTIATE_TEST_CASE_P(AnnHnswAceTest, AnnHnswAceTest_half, ::testing::ValuesIn(hnsw_ace_inputs)); +typedef AnnHnswAceTest AnnHnswAceMemoryFallbackTest_half; +TEST_P(AnnHnswAceMemoryFallbackTest_half, AnnHnswAceMemoryLimitFallback) +{ + this->testHnswAceMemoryLimitFallback(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest, + AnnHnswAceMemoryFallbackTest_half, + ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs)); + +typedef AnnHnswAceTest AnnHnswAceLayeredTest_half; +TEST_P(AnnHnswAceLayeredTest_half, AnnHnswAceLayeredBuildDeserializeSearch) +{ + this->testHnswAceLayeredBuildDeserializeSearch(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest, + AnnHnswAceLayeredTest_half, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + +typedef AnnHnswAceTest AnnHnswAceMaterializeTest_half; +TEST_P(AnnHnswAceMaterializeTest_half, AnnHnswAceLayeredMaterializeToHnswlib) +{ + this->testHnswAceLayeredMaterializeToHnswlib(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMaterializeTest, + AnnHnswAceMaterializeTest_half, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + } // namespace cuvs::neighbors::hnsw diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu index 279df6555f..a6fe986898 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu +++ b/cpp/tests/neighbors/ann_hnsw_ace/test_int8_t_uint32_t.cu @@ -14,4 +14,34 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceTest, AnnHnswAceTest_int8_t, ::testing::ValuesIn(hnsw_ace_inputs)); +typedef AnnHnswAceTest AnnHnswAceMemoryFallbackTest_int8_t; +TEST_P(AnnHnswAceMemoryFallbackTest_int8_t, AnnHnswAceMemoryLimitFallback) +{ + this->testHnswAceMemoryLimitFallback(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest, + AnnHnswAceMemoryFallbackTest_int8_t, + ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs)); + +typedef AnnHnswAceTest AnnHnswAceLayeredTest_int8_t; +TEST_P(AnnHnswAceLayeredTest_int8_t, AnnHnswAceLayeredBuildDeserializeSearch) +{ + this->testHnswAceLayeredBuildDeserializeSearch(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest, + AnnHnswAceLayeredTest_int8_t, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + +typedef AnnHnswAceTest AnnHnswAceMaterializeTest_int8_t; +TEST_P(AnnHnswAceMaterializeTest_int8_t, AnnHnswAceLayeredMaterializeToHnswlib) +{ + this->testHnswAceLayeredMaterializeToHnswlib(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMaterializeTest, + AnnHnswAceMaterializeTest_int8_t, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + } // namespace cuvs::neighbors::hnsw diff --git a/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu b/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu index 7e68dc4b17..13c553f260 100644 --- a/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu +++ b/cpp/tests/neighbors/ann_hnsw_ace/test_uint8_t_uint32_t.cu @@ -14,4 +14,34 @@ INSTANTIATE_TEST_CASE_P(AnnHnswAceTest, AnnHnswAceTest_uint8_t, ::testing::ValuesIn(hnsw_ace_inputs)); +typedef AnnHnswAceTest AnnHnswAceMemoryFallbackTest_uint8_t; +TEST_P(AnnHnswAceMemoryFallbackTest_uint8_t, AnnHnswAceMemoryLimitFallback) +{ + this->testHnswAceMemoryLimitFallback(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMemoryFallbackTest, + AnnHnswAceMemoryFallbackTest_uint8_t, + ::testing::ValuesIn(hnsw_ace_memory_fallback_inputs)); + +typedef AnnHnswAceTest AnnHnswAceLayeredTest_uint8_t; +TEST_P(AnnHnswAceLayeredTest_uint8_t, AnnHnswAceLayeredBuildDeserializeSearch) +{ + this->testHnswAceLayeredBuildDeserializeSearch(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceLayeredTest, + AnnHnswAceLayeredTest_uint8_t, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + +typedef AnnHnswAceTest AnnHnswAceMaterializeTest_uint8_t; +TEST_P(AnnHnswAceMaterializeTest_uint8_t, AnnHnswAceLayeredMaterializeToHnswlib) +{ + this->testHnswAceLayeredMaterializeToHnswlib(); +} + +INSTANTIATE_TEST_CASE_P(AnnHnswAceMaterializeTest, + AnnHnswAceMaterializeTest_uint8_t, + ::testing::ValuesIn(hnsw_ace_layered_inputs)); + } // namespace cuvs::neighbors::hnsw diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index d63ddbdb71..7a36217bae 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -36,6 +36,7 @@ add_executable(CAGRA_HNSW_ACE_EXAMPLE src/cagra_hnsw_ace_example.cu) add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu) add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu) add_executable(HNSW_ACE_EXAMPLE src/hnsw_ace_example.cu) +add_executable(HNSW_ACE_LAYERED_EXAMPLE src/hnsw_ace_layered_example.cu) add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu) add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu) add_executable(VAMANA_EXAMPLE src/vamana_example.cu) @@ -56,6 +57,7 @@ target_link_libraries( DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads ) target_link_libraries(HNSW_ACE_EXAMPLE PRIVATE cuvs::cuvs $) +target_link_libraries(HNSW_ACE_LAYERED_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $) diff --git a/examples/cpp/src/hnsw_ace_layered_example.cu b/examples/cpp/src/hnsw_ace_layered_example.cu new file mode 100644 index 0000000000..ced22ff3fe --- /dev/null +++ b/examples/cpp/src/hnsw_ace_layered_example.cu @@ -0,0 +1,354 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Large graphs exceeding the memory capacity can be built using the Augmented Core Extraction (ACE) +// algorithm, which partitions the dataset. The resulting HNSW index is too large to fit in memory +// as well. Thus, the index needs to be transferred to a search server with enough memory. +// +// HNSWHierarchy::GPU_LAYERED_ON_DISK is a special hierarchy that builds a layered HNSW index on +// disk. It emits one topology-only artifact, hnsw_index.cuvs. The dataset remains separate and does +// not need to be transferred to the search server, which typically has the dataset locally. +// +// This example demonstrates how to build a layered HNSW index with ACE and turn it into a standard +// hnswlib index for in-memory search: +// +// 1. Optionally quantize the dataset and queries to int8. +// 2. Build a single-file layered HNSW artifact with ACE using hnsw::build. +// 3. Materialize the layered artifact into a standard hnswlib index file on disk using +// hnsw::materialize_to_hnswlib (disk-to-disk, never holding the full index in host memory). +// 4. Read the materialized hnswlib index into memory using hnsw::deserialize (hierarchy = CPU). +// 5. Search the in-memory HNSW index. +// +// Layered-on-disk layout: +// +// index_dir/hnsw_index.cuvs +// fixed header + metadata JSON +// levels: uint8 [N], max HNSW level for each original row id +// base nodes + base links: uint32 node ids with hnswlib-ready link rows +// upper nodes + upper links: hnswlib-ready upper-layer topology +// +// The transferred index artifact is topology-only. The dataset is loaded locally during +// materialization from hnsw::materialize_params::dataset_path. The loader supports .npy and ANN +// benchmark *.bin datasets; this example writes a local dataset .npy only to make the demo +// self-contained. The materialized hnswlib index file is self-contained (it embeds the vectors), +// so reading it back needs no dataset path. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "common.cuh" + +// When 1, scalar-quantize the float dataset to int8. +#define HNSW_ACE_LAYERED_USE_QUANTIZATION 1 + +namespace { + +constexpr const char* kBuildDir = "/tmp/hnsw_ace_layered"; + +// Reports the wall-clock time of a callable in milliseconds. +template +double time_ms(F&& fn) +{ + const auto start = std::chrono::steady_clock::now(); + fn(); + return std::chrono::duration(std::chrono::steady_clock::now() - start) + .count(); +} + +double to_gib(double bytes) { return bytes / (1024.0 * 1024.0 * 1024.0); } + +template +std::string write_local_dataset(raft::host_matrix_view dataset, + const std::string& path) +{ + auto [fd, header_size] = cuvs::util::create_numpy_file( + path, {static_cast(dataset.extent(0)), static_cast(dataset.extent(1))}); + cuvs::util::write_large_file( + fd, dataset.data_handle(), dataset.extent(0) * dataset.extent(1) * sizeof(T), header_size); + return path; +} + +template +struct quantized_pair { + raft::host_matrix dataset; + raft::host_matrix queries; +}; + +quantized_pair quantize_dataset(raft::device_resources const& dev_resources, + raft::host_matrix_view dataset_float, + raft::host_matrix_view queries_float) +{ + std::cout << " quantize_dataset: training scalar quantizer (float -> int8)" << std::endl; + cuvs::preprocessing::quantize::scalar::params qp; + auto quantizer = cuvs::preprocessing::quantize::scalar::train(dev_resources, qp, dataset_float); + + auto dataset_i8 = + raft::make_host_matrix(dataset_float.extent(0), dataset_float.extent(1)); + cuvs::preprocessing::quantize::scalar::transform( + dev_resources, quantizer, dataset_float, dataset_i8.view()); + + auto queries_i8 = + raft::make_host_matrix(queries_float.extent(0), queries_float.extent(1)); + cuvs::preprocessing::quantize::scalar::transform( + dev_resources, quantizer, queries_float, queries_i8.view()); + + return {std::move(dataset_i8), std::move(queries_i8)}; +} + +auto make_hnsw_ace_params(const std::string& build_dir, const std::string& dataset_path) + -> cuvs::neighbors::hnsw::index_params +{ + using namespace cuvs::neighbors; + + hnsw::index_params hnsw_params; + hnsw_params.metric = cuvs::distance::DistanceType::L2Expanded; + hnsw_params.hierarchy = hnsw::HnswHierarchy::GPU_LAYERED_ON_DISK; + hnsw_params.M = 32; + hnsw_params.dataset_path = dataset_path; // Override this path on the search server. + hnsw_params.ef_construction = 120; + + auto ace_params = hnsw::graph_build_params::ace_params(); + ace_params.npartitions = 4; + ace_params.use_disk = true; + ace_params.build_dir = build_dir; + hnsw_params.graph_build_params = ace_params; + + return hnsw_params; +} + +template +auto hnsw_build(raft::device_resources const& dev_resources, + const cuvs::neighbors::hnsw::index_params& hnsw_params, + raft::host_matrix_view dataset) -> std::string +{ + using namespace cuvs::neighbors; + + std::unique_ptr> hnsw_index; + const auto build_ms = + time_ms([&]() { hnsw_index = hnsw::build(dev_resources, hnsw_params, dataset); }); + const auto artifact_path = hnsw_index->file_path(); + if (artifact_path.empty()) { + throw std::runtime_error("Expected layered HNSW build to return an artifact path."); + } + const auto artifact_bytes = static_cast(std::filesystem::file_size(artifact_path)); + std::cout << " hnsw_build: layered artifact written to " << artifact_path << "\n" + << " hnsw_build: build wall time " << build_ms << " ms, artifact " + << to_gib(artifact_bytes) << " GiB" << std::endl; + return artifact_path; +} + +// Materialize the layered artifact into a standard hnswlib index file on disk and time the +// disk-to-disk materialization. Returns the path to the native hnswlib index file. +template +auto hnsw_materialize(raft::device_resources const& dev_resources, + const cuvs::neighbors::hnsw::index_params& hnsw_params, + const std::string& artifact_path, + const std::string& dataset_path, + int64_t dim, + const std::string& output_path) -> std::string +{ + using namespace cuvs::neighbors; + + hnsw::materialize_params materialize_params; + materialize_params.dataset_path = dataset_path; + materialize_params.max_host_memory_gb = 0; // 0 => single in-memory reorder pass + materialize_params.num_threads = 0; // 0 => max threads + + const auto materialize_ms = time_ms([&]() { + hnsw::materialize_to_hnswlib(dev_resources, + materialize_params, + artifact_path, + output_path, + static_cast(dim), + hnsw_params.metric); + }); + + const auto native_bytes = static_cast(std::filesystem::file_size(output_path)); + std::cout << " hnsw_materialize: native hnswlib index written to " << output_path << "\n" + << " hnsw_materialize: wall time " << materialize_ms << " ms, output " + << to_gib(native_bytes) << " GiB" << std::endl; + return output_path; +} + +// Read the materialized hnswlib index into memory for search. The materialized file is a standard +// hnswlib index, so it is loaded with hierarchy == CPU and needs no dataset path (the file already +// embeds the vectors). +template +auto hnsw_load_native(raft::device_resources const& dev_resources, + const std::string& native_index_path, + cuvs::distance::DistanceType metric, + int64_t dim) -> std::unique_ptr> +{ + using namespace cuvs::neighbors; + + hnsw::index_params load_params; + load_params.hierarchy = hnsw::HnswHierarchy::CPU; + load_params.metric = metric; + + hnsw::index* loaded_index = nullptr; + hnsw::deserialize( + dev_resources, load_params, native_index_path, static_cast(dim), metric, &loaded_index); + return std::unique_ptr>(loaded_index); +} + +template +void hnsw_search(raft::device_resources const& dev_resources, + const cuvs::neighbors::hnsw::index& hnsw_index, + raft::host_matrix_view queries, + int64_t topk = 12) +{ + using namespace cuvs::neighbors; + + const int64_t n_queries = queries.extent(0); + auto indices_hnsw_host = raft::make_host_matrix(n_queries, topk); + auto distances_hnsw_host = raft::make_host_matrix(n_queries, topk); + + hnsw::search_params search_params; + search_params.ef = std::max(200, static_cast(topk) * 2); + search_params.num_threads = 1; + + hnsw::search(dev_resources, + search_params, + hnsw_index, + queries, + indices_hnsw_host.view(), + distances_hnsw_host.view()); + + auto neighbors = raft::make_device_matrix(dev_resources, n_queries, topk); + auto distances = raft::make_device_matrix(dev_resources, n_queries, topk); + auto neighbors_host = raft::make_host_matrix(n_queries, topk); + for (int64_t i = 0; i < n_queries; ++i) { + for (int64_t j = 0; j < topk; ++j) { + neighbors_host(i, j) = static_cast(indices_hnsw_host(i, j)); + } + } + + raft::copy(neighbors.data_handle(), + neighbors_host.data_handle(), + n_queries * topk, + raft::resource::get_cuda_stream(dev_resources)); + raft::copy(distances.data_handle(), + distances_hnsw_host.data_handle(), + n_queries * topk, + raft::resource::get_cuda_stream(dev_resources)); + raft::resource::sync_stream(dev_resources); + + print_results(dev_resources, neighbors.view(), distances.view()); +} + +} // namespace + +int main() +{ + raft::device_resources dev_resources; + + // Surface the per-phase build/materialize timing logs (RAFT_LOG_INFO). + raft::default_logger().set_level(rapids_logger::level_enum::info); + + rmm::mr::pool_memory_resource pool_mr(rmm::mr::get_current_device_resource_ref(), + 1024 * 1024 * 1024ull); + rmm::mr::set_current_device_resource(pool_mr); + +#if HNSW_ACE_LAYERED_USE_QUANTIZATION + std::cout << "[stage 1] Generate and quantize dataset (float -> int8)" << std::endl; +#else + std::cout << "[stage 1] Generate dataset (float)" << std::endl; +#endif + + int64_t n_samples = 10000; + int64_t n_dim = 90; + int64_t n_queries = 10; + auto dataset = raft::make_device_matrix(dev_resources, n_samples, n_dim); + auto queries = raft::make_device_matrix(dev_resources, n_queries, n_dim); + generate_dataset(dev_resources, dataset.view(), queries.view()); + + auto dataset_host = raft::make_host_matrix(n_samples, n_dim); + auto queries_host = raft::make_host_matrix(n_queries, n_dim); + raft::copy(dataset_host.data_handle(), + dataset.data_handle(), + dataset.extent(0) * dataset.extent(1), + raft::resource::get_cuda_stream(dev_resources)); + raft::copy(queries_host.data_handle(), + queries.data_handle(), + queries.extent(0) * queries.extent(1), + raft::resource::get_cuda_stream(dev_resources)); + raft::resource::sync_stream(dev_resources); + + auto dataset_host_view = raft::make_host_matrix_view( + dataset_host.data_handle(), n_samples, n_dim); + auto queries_host_view = raft::make_host_matrix_view( + queries_host.data_handle(), n_queries, n_dim); + + std::filesystem::create_directories(kBuildDir); + +#if HNSW_ACE_LAYERED_USE_QUANTIZATION + auto q = quantize_dataset(dev_resources, dataset_host_view, queries_host_view); + auto dataset_i8_view = raft::make_host_matrix_view( + q.dataset.data_handle(), n_samples, n_dim); + auto queries_i8_view = raft::make_host_matrix_view( + q.queries.data_handle(), n_queries, n_dim); + auto dataset_path = write_local_dataset(dataset_i8_view, std::string{kBuildDir} + "/dataset.npy"); + auto hnsw_params = make_hnsw_ace_params(kBuildDir, dataset_path); + + const std::string native_index_path = std::string{kBuildDir} + "/hnsw_native.bin"; + + std::cout << "[stage 2] Build layered HNSW index with ACE" << std::endl; + auto artifact_path = hnsw_build(dev_resources, hnsw_params, dataset_i8_view); + + std::cout << "[stage 3] Materialize layered HNSW -> native hnswlib index" << std::endl; + hnsw_materialize( + dev_resources, hnsw_params, artifact_path, dataset_path, n_dim, native_index_path); + + std::cout << "[stage 4] Read materialized hnswlib index into memory" << std::endl; + auto hnsw_index = + hnsw_load_native(dev_resources, native_index_path, hnsw_params.metric, n_dim); + + std::cout << "[stage 5] Search HNSW index" << std::endl; + hnsw_search(dev_resources, *hnsw_index, queries_i8_view); +#else + auto dataset_path = + write_local_dataset(dataset_host_view, std::string{kBuildDir} + "/dataset.npy"); + auto hnsw_params = make_hnsw_ace_params(kBuildDir, dataset_path); + + const std::string native_index_path = std::string{kBuildDir} + "/hnsw_native.bin"; + + std::cout << "[stage 2] Build layered HNSW index with ACE" << std::endl; + auto artifact_path = hnsw_build(dev_resources, hnsw_params, dataset_host_view); + + std::cout << "[stage 3] Materialize layered HNSW -> native hnswlib index" << std::endl; + hnsw_materialize( + dev_resources, hnsw_params, artifact_path, dataset_path, n_dim, native_index_path); + + std::cout << "[stage 4] Read materialized hnswlib index into memory" << std::endl; + auto hnsw_index = + hnsw_load_native(dev_resources, native_index_path, hnsw_params.metric, n_dim); + + std::cout << "[stage 5] Search HNSW index" << std::endl; + hnsw_search(dev_resources, *hnsw_index, queries_host_view); +#endif + + return 0; +} diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java index 3eef491b62..14d897521e 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java @@ -82,6 +82,44 @@ static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParams, CuVS return CuVSProvider.provider().hnswIndexBuild(resources, hnswParams, dataset); } + /** + * Materializes a layered HNSW artifact into a standard hnswlib index file on + * disk. + * + * Materializes a {@code GPU_LAYERED_ON_DISK} artifact (graph topology only, + * stored in ACE order) plus a local dataset into a standard hnswlib index file, + * without ever holding the full materialized index in host memory. The + * resulting file is compatible with the original hnswlib library and can be read + * back with {@code hierarchy == CPU}. The element data type is read from the + * artifact header. + * + * @param resources The CuVS resources + * @param materializeParams Materialization parameters (dataset path, host-memory + * budget, threads) + * @param layeredArtifactPath Path to the layered HNSW artifact + * @param outputPath Path to the hnswlib index file to write + * @param dim The dimension of the vectors in the index + * @param metric The distance metric used to build the index + * @throws Throwable if an error occurs during materialization + */ + static void materializeToHnswlib( + CuVSResources resources, + HnswMaterializeParams materializeParams, + String layeredArtifactPath, + String outputPath, + int dim, + HnswIndexParams.CuvsDistanceType metric) + throws Throwable { + Objects.requireNonNull(resources); + Objects.requireNonNull(materializeParams); + Objects.requireNonNull(layeredArtifactPath); + Objects.requireNonNull(outputPath); + Objects.requireNonNull(metric); + CuVSProvider.provider() + .hnswMaterializeToHnswlib( + resources, materializeParams, layeredArtifactPath, outputPath, dim, metric); + } + /** * Builder helps configure and create an instance of {@link HnswIndex}. */ diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java index 070cbedae1..6de60ed4f1 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java @@ -46,7 +46,16 @@ public enum CuvsHnswHierarchy { /** * Full hierarchy is built using the GPU */ - GPU(2); + GPU(2), + + /** + * GPU-built hierarchy stored as a layered on-disk topology artifact. + * + * The artifact stores graph topology only. When loading such an artifact, the + * {@code datasetPath} must point to the original-ID-ordered vectors used to + * reconstruct an in-memory HNSW index. + */ + GPU_LAYERED_ON_DISK(3); /** * The value for the enum choice. @@ -65,6 +74,8 @@ public static class HnswHierarchy { public static final CuvsHnswHierarchy NONE = CuvsHnswHierarchy.NONE; public static final CuvsHnswHierarchy CPU = CuvsHnswHierarchy.CPU; public static final CuvsHnswHierarchy GPU = CuvsHnswHierarchy.GPU; + public static final CuvsHnswHierarchy GPU_LAYERED_ON_DISK = + CuvsHnswHierarchy.GPU_LAYERED_ON_DISK; } private CuvsHnswHierarchy hierarchy = CuvsHnswHierarchy.GPU; @@ -74,6 +85,7 @@ public static class HnswHierarchy { private long m = 32; private CuvsDistanceType metric = CuvsDistanceType.L2Expanded; private HnswAceParams aceParams; + private String datasetPath; private HnswIndexParams( CuvsHnswHierarchy hierarchy, @@ -82,7 +94,8 @@ private HnswIndexParams( int vectorDimension, long m, CuvsDistanceType metric, - HnswAceParams aceParams) { + HnswAceParams aceParams, + String datasetPath) { this.hierarchy = hierarchy; this.efConstruction = efConstruction; this.numThreads = numThreads; @@ -90,6 +103,7 @@ private HnswIndexParams( this.m = m; this.metric = metric; this.aceParams = aceParams; + this.datasetPath = datasetPath; } /** @@ -153,6 +167,20 @@ public HnswAceParams getAceParams() { return aceParams; } + /** + * Gets the local dataset path used by layered HNSW deserialization. + * + * Required when {@code hierarchy == GPU_LAYERED_ON_DISK}: the artifact stores + * graph topology only, and loading reads the original-ID-ordered vectors from + * this path to reconstruct an in-memory HNSW index. Ignored for all other + * hierarchies. + * + * @return the dataset path, or null if not set + */ + public String getDatasetPath() { + return datasetPath; + } + @Override public String toString() { return "HnswIndexParams [hierarchy=" @@ -169,6 +197,8 @@ public String toString() { + metric + ", aceParams=" + aceParams + + ", datasetPath=" + + datasetPath + "]"; } @@ -184,6 +214,7 @@ public static class Builder { private long m = 32; private CuvsDistanceType metric = CuvsDistanceType.L2Expanded; private HnswAceParams aceParams; + private String datasetPath; /** * Constructs this Builder with an instance of Arena. @@ -276,6 +307,22 @@ public Builder withAceParams(HnswAceParams aceParams) { return this; } + /** + * Sets the local dataset path used by layered HNSW deserialization. + * + * Required when {@code hierarchy == GPU_LAYERED_ON_DISK}: the artifact stores + * graph topology only, and loading reads the original-ID-ordered vectors from + * this path to reconstruct an in-memory HNSW index. Ignored for all other + * hierarchies. + * + * @param datasetPath the local dataset path + * @return an instance of Builder + */ + public Builder withDatasetPath(String datasetPath) { + this.datasetPath = datasetPath; + return this; + } + /** * Builds an instance of {@link HnswIndexParams}. * @@ -289,7 +336,8 @@ public HnswIndexParams build() { vectorDimension, m, metric, - aceParams); + aceParams, + datasetPath); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswMaterializeParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswMaterializeParams.java new file mode 100644 index 0000000000..efbcc8acc1 --- /dev/null +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswMaterializeParams.java @@ -0,0 +1,134 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +/** + * Parameters for materializing a layered HNSW artifact into a standard hnswlib + * index file on disk. + * + * @since 26.08 + */ +public class HnswMaterializeParams { + + private final String datasetPath; + private final double maxHostMemoryGb; + private final int numThreads; + + private HnswMaterializeParams(String datasetPath, double maxHostMemoryGb, int numThreads) { + this.datasetPath = datasetPath; + this.maxHostMemoryGb = maxHostMemoryGb; + this.numThreads = numThreads; + } + + /** + * Gets the local dataset path holding the original-ID-ordered vectors used to + * build the artifact. + * + * @return the dataset path, or null if not set + */ + public String getDatasetPath() { + return datasetPath; + } + + /** + * Gets the upper bound on host memory (in GiB) used for the base-topology + * reorder buffer. When {@code <= 0}, the whole base topology is reordered in a + * single in-memory pass. + * + * @return the max host memory in GiB (0 means a single in-memory pass) + */ + public double getMaxHostMemoryGb() { + return maxHostMemoryGb; + } + + /** + * Gets the number of host threads to use. When 0, the maximum number of + * threads is used. + * + * @return the number of threads + */ + public int getNumThreads() { + return numThreads; + } + + @Override + public String toString() { + return "HnswMaterializeParams [datasetPath=" + + datasetPath + + ", maxHostMemoryGb=" + + maxHostMemoryGb + + ", numThreads=" + + numThreads + + "]"; + } + + /** + * Builder configures and creates an instance of {@link HnswMaterializeParams}. + */ + public static class Builder { + + private String datasetPath; + private double maxHostMemoryGb = 0; + private int numThreads = 0; + + /** + * Constructs this Builder. + */ + public Builder() {} + + /** + * Sets the local dataset path holding the original-ID-ordered vectors used to + * build the artifact. Supported formats match layered deserialization: + * {@code .npy} and ANN benchmark {@code *.bin} files with a + * {@code [uint32 rows, uint32 cols]} header ({@code .fbin}, {@code .f16bin}, + * {@code .u8bin}, {@code .i8bin}). + * + * @param datasetPath the local dataset path + * @return an instance of Builder + */ + public Builder withDatasetPath(String datasetPath) { + this.datasetPath = datasetPath; + return this; + } + + /** + * Sets the upper bound on host memory (in GiB) used for the base-topology + * reorder buffer. + * + * When {@code <= 0} (default), the whole base topology is reordered in a single + * in-memory pass (no temporary files). When set, the base topology is reordered + * through bucketed temporary files so that peak host memory stays close to this + * budget. + * + * @param maxHostMemoryGb the max host memory in GiB + * @return an instance of Builder + */ + public Builder withMaxHostMemoryGb(double maxHostMemoryGb) { + this.maxHostMemoryGb = maxHostMemoryGb; + return this; + } + + /** + * Sets the number of host threads to use. When 0 (default), the maximum number + * of threads is used. + * + * @param numThreads the number of threads + * @return an instance of Builder + */ + public Builder withNumThreads(int numThreads) { + this.numThreads = numThreads; + return this; + } + + /** + * Builds an instance of {@link HnswMaterializeParams}. + * + * @return an instance of {@link HnswMaterializeParams} + */ + public HnswMaterializeParams build() { + return new HnswMaterializeParams(datasetPath, maxHostMemoryGb, numThreads); + } + } +} diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java index c39578755c..9ba6a34475 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java @@ -135,6 +135,28 @@ HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable; + /** + * Materializes a layered HNSW artifact into a standard hnswlib index file on disk. + * + * @param resources The CuVS resources + * @param materializeParams Materialization parameters (dataset path, host-memory budget, threads) + * @param layeredArtifactPath Path to the layered HNSW artifact + * @param outputPath Path to the hnswlib index file to write + * @param dim The dimension of the vectors in the index + * @param metric The distance metric used to build the index + * @throws Throwable if an error occurs during materialization + */ + default void hnswMaterializeToHnswlib( + CuVSResources resources, + HnswMaterializeParams materializeParams, + String layeredArtifactPath, + String outputPath, + int dim, + HnswIndexParams.CuvsDistanceType metric) + throws Throwable { + throw new UnsupportedOperationException(); + } + /** Creates a new TieredIndex Builder. */ TieredIndex.Builder newTieredIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException; diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java index 950504bc5a..573699a39d 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java @@ -160,6 +160,27 @@ public void close() { } } + static CloseableHandle createHnswMaterializeParamsNative() { + try (var localArena = Arena.ofConfined()) { + var paramsPtrPtr = localArena.allocate(cuvsHnswMaterializeParams_t); + checkCuVSError( + cuvsHnswMaterializeParamsCreate(paramsPtrPtr), "cuvsHnswMaterializeParamsCreate"); + var paramsPtr = paramsPtrPtr.get(cuvsHnswMaterializeParams_t, 0L); + return new CloseableHandle() { + @Override + public MemorySegment handle() { + return paramsPtr; + } + + @Override + public void close() { + checkCuVSError( + cuvsHnswMaterializeParamsDestroy(paramsPtr), "cuvsHnswMaterializeParamsDestroy"); + } + }; + } + } + static CloseableHandle createTieredIndexParams() { try (var localArena = Arena.ofConfined()) { var paramsPtrPtr = localArena.allocate(cuvsTieredIndexParams_t); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java index ca528ac010..c6e241180b 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java @@ -6,6 +6,7 @@ import static com.nvidia.cuvs.internal.CuVSParamsHelper.createHnswAceParamsNative; import static com.nvidia.cuvs.internal.CuVSParamsHelper.createHnswIndexParams; +import static com.nvidia.cuvs.internal.CuVSParamsHelper.createHnswMaterializeParamsNative; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_LONG; import static com.nvidia.cuvs.internal.common.Util.buildMemorySegment; @@ -19,6 +20,7 @@ import com.nvidia.cuvs.HnswAceParams; import com.nvidia.cuvs.HnswIndex; import com.nvidia.cuvs.HnswIndexParams; +import com.nvidia.cuvs.HnswMaterializeParams; import com.nvidia.cuvs.HnswQuery; import com.nvidia.cuvs.HnswSearchParams; import com.nvidia.cuvs.SearchResults; @@ -27,6 +29,7 @@ import com.nvidia.cuvs.internal.panama.cuvsHnswAceParams; import com.nvidia.cuvs.internal.panama.cuvsHnswIndex; import com.nvidia.cuvs.internal.panama.cuvsHnswIndexParams; +import com.nvidia.cuvs.internal.panama.cuvsHnswMaterializeParams; import com.nvidia.cuvs.internal.panama.cuvsHnswSearchParams; import java.io.InputStream; import java.lang.foreign.Arena; @@ -197,7 +200,14 @@ private IndexReference deserialize(InputStream inputStream) throws Throwable { cuvsHnswIndex.dtype(indexReference.memorySegment, dtype); - try (var params = segmentFromIndexParams(hnswIndexParams); + // The metric only affects loading for the layered (GPU_LAYERED_ON_DISK) hierarchy, where the + // native side validates it against the artifact header. For the CPU/GPU/NONE hnswlib paths + // the metric is otherwise unused at load time. + int deserializeMetric = + hnswIndexParams.getHierarchy() == HnswIndexParams.CuvsHnswHierarchy.GPU_LAYERED_ON_DISK + ? hnswIndexParams.getMetric().value + : 0; + try (var params = segmentFromIndexParams(localArena, hnswIndexParams); var cuvsResourcesAccessor = resources.access()) { checkCuVSError( cuvsHnswDeserialize( @@ -205,7 +215,7 @@ private IndexReference deserialize(InputStream inputStream) throws Throwable { params.handle(), pathSeg, hnswIndexParams.getVectorDimension(), - 0, + deserializeMetric, indexReference.memorySegment), "cuvsHnswDeserialize"); } @@ -218,13 +228,21 @@ private IndexReference deserialize(InputStream inputStream) throws Throwable { } /** - * Allocates the configured search parameters in the MemorySegment. + * Allocates the configured index parameters in the MemorySegment. + * + * The {@code dataset_path} string (when set) is allocated from {@code arena}, so + * the caller must keep {@code arena} alive until the native call that consumes + * the params has returned. */ - private CloseableHandle segmentFromIndexParams(HnswIndexParams params) { + private CloseableHandle segmentFromIndexParams(Arena arena, HnswIndexParams params) { var hnswParams = createHnswIndexParams(); cuvsHnswIndexParams.hierarchy(hnswParams.handle(), params.getHierarchy().value); cuvsHnswIndexParams.ef_construction(hnswParams.handle(), params.getEfConstruction()); cuvsHnswIndexParams.num_threads(hnswParams.handle(), params.getNumThreads()); + String datasetPath = params.getDatasetPath(); + if (datasetPath != null) { + cuvsHnswIndexParams.dataset_path(hnswParams.handle(), arena.allocateFrom(datasetPath)); + } return hnswParams; } @@ -288,6 +306,56 @@ public static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParam return new HnswIndexImpl(new IndexReference(hnswIndex), resources, hnswParams); } + /** + * Materializes a layered HNSW artifact into a standard hnswlib index file on disk. + * + * @param resources the CuVS resources + * @param materializeParams the materialization parameters + * @param layeredArtifactPath path to the layered HNSW artifact + * @param outputPath path to the hnswlib index file to write + * @param dim the dimension of the vectors in the index + * @param metric the distance metric used to build the index + * @throws Throwable if an error occurs during materialization + */ + public static void materializeToHnswlib( + CuVSResources resources, + HnswMaterializeParams materializeParams, + String layeredArtifactPath, + String outputPath, + int dim, + HnswIndexParams.CuvsDistanceType metric) + throws Throwable { + Objects.requireNonNull(resources); + Objects.requireNonNull(materializeParams); + Objects.requireNonNull(layeredArtifactPath); + Objects.requireNonNull(outputPath); + Objects.requireNonNull(metric); + + try (var localArena = Arena.ofConfined(); + var paramsHandle = createHnswMaterializeParamsNative()) { + MemorySegment paramsSeg = paramsHandle.handle(); + + String datasetPath = materializeParams.getDatasetPath(); + if (datasetPath != null) { + cuvsHnswMaterializeParams.dataset_path(paramsSeg, localArena.allocateFrom(datasetPath)); + } + cuvsHnswMaterializeParams.max_host_memory_gb( + paramsSeg, materializeParams.getMaxHostMemoryGb()); + cuvsHnswMaterializeParams.num_threads(paramsSeg, materializeParams.getNumThreads()); + + MemorySegment artifactSeg = buildMemorySegment(localArena, layeredArtifactPath); + MemorySegment outputSeg = buildMemorySegment(localArena, outputPath); + + try (var resourcesAccessor = resources.access()) { + var cuvsRes = resourcesAccessor.handle(); + int returnValue = + cuvsHnswMaterializeToHnswlib( + cuvsRes, paramsSeg, artifactSeg, outputSeg, dim, metric.value); + checkCuVSError(returnValue, "cuvsHnswMaterializeToHnswlib"); + } + } + } + private static CloseableHandle createHnswIndexParamsForBuild(Arena arena, HnswIndexParams params) { var hnswParams = createHnswIndexParams(); MemorySegment seg = hnswParams.handle(); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java index 1d3199f26f..b7b03c8ce5 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java @@ -260,6 +260,19 @@ public HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswPar return HnswIndexImpl.build(resources, hnswParams, dataset); } + @Override + public void hnswMaterializeToHnswlib( + CuVSResources resources, + HnswMaterializeParams materializeParams, + String layeredArtifactPath, + String outputPath, + int dim, + HnswIndexParams.CuvsDistanceType metric) + throws Throwable { + HnswIndexImpl.materializeToHnswlib( + resources, materializeParams, layeredArtifactPath, outputPath, dim, metric); + } + @Override public TieredIndex.Builder newTieredIndexBuilder(CuVSResources cuVSResources) { return TieredIndexImpl.newBuilder(Objects.requireNonNull(cuVSResources)); diff --git a/python/cuvs/cuvs/neighbors/hnsw/__init__.py b/python/cuvs/cuvs/neighbors/hnsw/__init__.py index f91835b7c5..3e8541b603 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/__init__.py +++ b/python/cuvs/cuvs/neighbors/hnsw/__init__.py @@ -7,11 +7,13 @@ ExtendParams, Index, IndexParams, + MaterializeParams, SearchParams, build, extend, from_cagra, load, + materialize_to_hnswlib, save, search, ) @@ -21,10 +23,12 @@ "IndexParams", "Index", "ExtendParams", + "MaterializeParams", "build", "extend", "SearchParams", "load", + "materialize_to_hnswlib", "save", "search", "from_cagra", diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd index 9ffb295ad3..1dff7a0809 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd @@ -20,6 +20,7 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: NONE CPU GPU + GPU_LAYERED_ON_DISK ctypedef struct cuvsHnswAceParams: size_t npartitions @@ -41,6 +42,7 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: size_t M cuvsDistanceType metric cuvsHnswAceParams_t ace_params + const char* dataset_path ctypedef cuvsHnswIndexParams* cuvsHnswIndexParams_t @@ -105,3 +107,24 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: int32_t dim, cuvsDistanceType metric, cuvsHnswIndex_t index) except + + + ctypedef struct cuvsHnswMaterializeParams: + const char* dataset_path + double max_host_memory_gb + int num_threads + + ctypedef cuvsHnswMaterializeParams* cuvsHnswMaterializeParams_t + + cuvsError_t cuvsHnswMaterializeParamsCreate( + cuvsHnswMaterializeParams_t* params) + + cuvsError_t cuvsHnswMaterializeParamsDestroy( + cuvsHnswMaterializeParams_t params) + + cuvsError_t cuvsHnswMaterializeToHnswlib( + cuvsResources_t res, + cuvsHnswMaterializeParams_t params, + const char* layered_artifact_path, + const char* output_path, + int32_t dim, + cuvsDistanceType metric) except + diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx index 6757695de3..bff11001d0 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -124,10 +124,15 @@ cdef class IndexParams: Parameters ---------- hierarchy : string, default = "gpu" (optional) - The hierarchy of the HNSW index. Valid values are ["none", "cpu", "gpu"]. + The hierarchy of the HNSW index. Valid values are + ["none", "cpu", "gpu", "gpu_layered_on_disk"]. - "none": No hierarchy is built. - "cpu": Hierarchy is built using CPU. - "gpu": Hierarchy is built using GPU. + - "gpu_layered_on_disk": The index artifact stores graph topology only + (built on the GPU and stored as a layered on-disk artifact). When + loading such an artifact with `load()`, `dataset_path` must point to + the original-ID-ordered vectors used to reconstruct the index. ef_construction : int, default = 200 (optional) Maximum number of candidate list size used during construction when hierarchy is `cpu`. @@ -148,14 +153,22 @@ cdef class IndexParams: ace_params : AceParams, default = None (optional) ACE parameters for building HNSW index using ACE algorithm. If set, enables the build() function to use ACE for index construction. + dataset_path : string, default = None (optional) + Local dataset path used by layered HNSW deserialization. Required when + `hierarchy == "gpu_layered_on_disk"`: the artifact stores graph + topology only, and `load()` reads the original-ID-ordered vectors from + this path to reconstruct an in-memory HNSW index. Ignored for all other + hierarchies. """ cdef cuvsHnswIndexParams* params cdef AceParams _ace_params + cdef object _dataset_path_bytes def __cinit__(self): check_cuvs(cuvsHnswIndexParamsCreate(&self.params)) self._ace_params = None + self._dataset_path_bytes = None def __dealloc__(self): check_cuvs(cuvsHnswIndexParamsDestroy(self.params)) @@ -166,16 +179,20 @@ cdef class IndexParams: num_threads=0, M=32, metric="sqeuclidean", - ace_params=None): + ace_params=None, + dataset_path=None): if hierarchy == "none": self.params.hierarchy = cuvsHnswHierarchy.NONE elif hierarchy == "cpu": self.params.hierarchy = cuvsHnswHierarchy.CPU elif hierarchy == "gpu": self.params.hierarchy = cuvsHnswHierarchy.GPU + elif hierarchy == "gpu_layered_on_disk": + self.params.hierarchy = cuvsHnswHierarchy.GPU_LAYERED_ON_DISK else: raise ValueError("Invalid hierarchy type." - " Valid values are 'none', 'cpu', and 'gpu'.") + " Valid values are 'none', 'cpu', 'gpu', and" + " 'gpu_layered_on_disk'.") self.params.ef_construction = ef_construction self.params.num_threads = num_threads self.params.M = M @@ -189,6 +206,12 @@ cdef class IndexParams: else: self.params.ace_params = NULL + if dataset_path is not None: + self._dataset_path_bytes = dataset_path.encode('utf-8') + self.params.dataset_path = self._dataset_path_bytes + else: + self.params.dataset_path = NULL + @property def hierarchy(self): if self.params.hierarchy == cuvsHnswHierarchy.NONE: @@ -197,6 +220,8 @@ cdef class IndexParams: return "cpu" elif self.params.hierarchy == cuvsHnswHierarchy.GPU: return "gpu" + elif self.params.hierarchy == cuvsHnswHierarchy.GPU_LAYERED_ON_DISK: + return "gpu_layered_on_disk" @property def ef_construction(self): @@ -214,6 +239,12 @@ cdef class IndexParams: def ace_params(self): return self._ace_params + @property + def dataset_path(self): + if self.params.dataset_path is not NULL: + return self.params.dataset_path.decode('utf-8') + return None + cdef class Index: """ @@ -270,6 +301,68 @@ cdef class ExtendParams: return self.params.num_threads +cdef class MaterializeParams: + """ + Parameters for materializing a layered HNSW artifact into an hnswlib + index on disk. + + Parameters + ---------- + dataset_path : string, default = None (optional) + Local dataset path holding the original-ID-ordered vectors used to + build the artifact. Supported formats match layered deserialization: + `.npy` and ANN benchmark `*.bin` files with a + `[uint32 rows, uint32 cols]` header (`.fbin`, `.f16bin`, `.u8bin`, + `.i8bin`). + max_host_memory_gb : float, default = 0 (optional) + Upper bound on host memory (in GiB) used for the base-topology reorder + buffer. When <= 0, the whole base topology is reordered in a single + in-memory pass (no temporary files). When set, the base topology is + reordered through bucketed temporary files so that peak host memory + stays close to this budget. + num_threads : int, default = 0 (optional) + Number of host threads to use. When 0, the maximum number of threads + is used. + """ + + cdef cuvsHnswMaterializeParams* params + cdef object _dataset_path_bytes + + def __cinit__(self): + check_cuvs(cuvsHnswMaterializeParamsCreate(&self.params)) + self._dataset_path_bytes = None + + def __dealloc__(self): + if self.params is not NULL: + check_cuvs(cuvsHnswMaterializeParamsDestroy(self.params)) + + def __init__(self, *, + dataset_path=None, + max_host_memory_gb=0, + num_threads=0): + if dataset_path is not None: + self._dataset_path_bytes = dataset_path.encode('utf-8') + self.params.dataset_path = self._dataset_path_bytes + else: + self.params.dataset_path = NULL + self.params.max_host_memory_gb = max_host_memory_gb + self.params.num_threads = num_threads + + @property + def dataset_path(self): + if self.params.dataset_path is not NULL: + return self.params.dataset_path.decode('utf-8') + return None + + @property + def max_host_memory_gb(self): + return self.params.max_host_memory_gb + + @property + def num_threads(self): + return self.params.num_threads + + @auto_sync_resources def save(filename, Index index, resources=None): """ @@ -406,6 +499,84 @@ def load(IndexParams index_params, filename, dim, dtype, metric="sqeuclidean", return idx +@auto_sync_resources +def materialize_to_hnswlib(MaterializeParams materialize_params, + layered_artifact_path, + output_path, + dim, + metric="sqeuclidean", + resources=None): + """ + Materialize a layered HNSW artifact into a standard hnswlib index file + on disk. + + Materializes a `gpu_layered_on_disk` artifact (graph topology only, stored + in ACE order) plus a local dataset into a standard hnswlib index file, + without ever holding the full materialized index in host memory. The + resulting file is compatible with the original hnswlib library and can be + read back through `load()` with `hierarchy="cpu"`. The element data type + (float32, float16, uint8, int8) is read from the artifact header. + + Parameters + ---------- + materialize_params : MaterializeParams + Materialization parameters. `dataset_path` must point to the + original-ID-ordered vectors used to build the artifact. + layered_artifact_path : string + Path to the layered HNSW artifact. + output_path : string + Path to the hnswlib index file to write. + dim : int + Dimensions of the training dataset. + metric : string denoting the metric type, default="sqeuclidean" + Valid values for metric: ["sqeuclidean", "inner_product"], where + - sqeuclidean is the euclidean distance without the square root + operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, + - inner_product distance is defined as + distance(a, b) = \\sum_i a_i * b_i. + {resources_docstring} + + Examples + -------- + >>> import numpy as np + >>> from cuvs.neighbors import hnsw + >>> n_features = 50 + >>> # Assume a layered artifact was produced by an ACE GPU build and the + >>> # original-ID-ordered vectors are stored in "dataset.fbin". + >>> materialize_params = hnsw.MaterializeParams( + ... dataset_path="dataset.fbin" + ... ) + >>> hnsw.materialize_to_hnswlib( + ... materialize_params, + ... "layered_artifact.cuvs", + ... "index.bin", + ... n_features, + ... metric="sqeuclidean", + ... ) + >>> # The materialized index can be loaded as a standard hnswlib index. + >>> index = hnsw.load( + ... hnsw.IndexParams(hierarchy="cpu"), + ... "index.bin", + ... n_features, + ... np.float32, + ... "sqeuclidean", + ... ) + """ + cdef string c_artifact = layered_artifact_path.encode('utf-8') + cdef string c_output = output_path.encode('utf-8') + cdef cuvsDistanceType distance_type = DISTANCE_TYPES[metric] + cdef cuvsResources_t res = resources.get_c_obj() + + check_cuvs(cuvsHnswMaterializeToHnswlib( + res, + materialize_params.params, + c_artifact.c_str(), + c_output.c_str(), + dim, + distance_type + )) + + @auto_sync_resources def from_cagra(IndexParams index_params, cagra.Index cagra_index, temporary_index_path=None, resources=None):