diff --git a/README.md b/README.md index 7572dc1c1..fdd7ca0a0 100644 --- a/README.md +++ b/README.md @@ -233,7 +233,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::static_multimap` is a fixed-size hash table that supports storing equivalent keys. It uses double hashing by default and supports switching to linear probing. See the Doxygen documentation in `static_multimap.cuh` for more detailed information. #### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJylVgtv2zYQ_isHDUXtVJYfaFDEjQN4bYoZK5whTlsUcaHQFG0TkUmNpOx6hv_77ijJlpsM67AWiCHe-7vvjtwFVlgrtbJB_34XyCTod8MgZWqRs4UI-gHPExaEgdW54fTdPpsqOIN3OtsauVg6aPAm9Dq9bgv_vA5h_Hn0fjSEdze3f9zcDu9GN-OIDLzRR8mFsiKBXCXCgFsKGGaM408pCeGzMJQN9KIONEhhGpSyadB8671sdQ4rtgWlHeRWoBtpYS5TAeI7F5kDqYDrVZZKpriAjXRLH6r049OBr6UTPXMM9RlaZPg1r2sCc4fU6d_Suazfbm82m4j5tCNtFu20ULbtj6N31-PJdQtTP5h9UinCC0b8mUuDhc-2wDLMjLMZ5puyDWgDbGEEypymzDdGOqkWIVg9dxtmhPeTSOuMnOXuBLwqT6y_roDwMYXADScwmkwD-HU4GU1C7-fL6O63m0938GV4ezsc342uJ3Bzi80avx9Rq_DrAwzHX-H30fh9CAKhw1Die2aoCkxVEqwiKTCcCHGSxlwXadlMcDmXHCoawUKvhVFYFmTCrGRBOEwy8X5SuZKOOX_2pDgfqj1VU_WLVDzNEwGXPOe6bcmEx6s8dXLFsojny6tTNbc0uXXtRKzRVbwW3GkTkdITFemEYShtc50rgj-uTp7Xt9hPgeR6XuoMUxbBWEU_ZuQrtf5QKocElKqx1jJpTtUO60Q6E0aPYhu7bSaQcgOkhHt7FK1ZmotCWIlIeLAQq8xtY_q0ggoRqWfuAFpd76VmX6gWBwflUpFUObbDUevBuqTft_IvNIQxqpx3XnY6nVKt3cZdgJom5w5bClU_irnrdjoh6oJNNUqLGlpd6rz_sb7TPhOqoe2zKb1WOdkIxtoVTOM0dxKVkfF8qVEFHpXekNcNTXqaIihWGIc5-rjo1IalQ6InzkWqWQJzRmSgeT_vvIh8ucgprPKUVJcVsGENuSvsW-bbBQjHGfTC0vgA_u5pG_anSt7b7rkW7PclsAWZ-v0T-l4WTjImzfO5XQHJbGPcrPXHCIYAMqhoS3V7tQiuBS4yggmW1I2NxuIcrjYblcbX0SIKYbdDPLGGXTfs4k8URYAnvXMiQnEM_qMU7usFHMahUWBWna_Yo4ifDNwlsvqq0WmGP69clEIEbTQru-JsJhY4YdXZ_TeI4xLOuMFyWrhN2OF2drlR8G_Q7iS8gMYY2tBrhiD3b2FfA3nkeYcTgBz0wWk8tWct0oWUaEkV7GycZlcmK1SC-R89TurdMnomPJsJdyC4iy68vnh5cXGx_2fCVKVceevY6XguMZDnx9GmYkajrnRM8OTU5xlCp84wakwxzJrz3BjyZSlxnzJea_eYdbE3mrRZ6NLFmwoFHqBqh5fu7vDsIdZ4mZkHsPl8Lr-jaoK3phMUhbkfQlEkhg8C1fLspfXQ9c58k_0mA3SX5S4mmuAO8zcGZV2E-em6m_9zOpMYr1Is1zZq-dSQ_ICRChI9bfbe702LgRAFZq3mktF9f9icdeI9HCI9VAwljGjSH-pl3ctvD5BoYdVLhzc9PiPCmi1KIysQwAQGg2evjIcDzGVBgnQ9vjhXRoq1-I8Qh0eQKrWCrD7KwWnZyFrU1lPDI7DEKacdS0Hlq5nwu9_nVS08sEudp_hEQ-6N4RX4QS-qs35o8b3ZOCHR4MdkBnAwPBClXC50XeKU4kuanqb4WDXHB3eg1px3e-d5F8U6c8VrPGjhvTvgr15130CLGb4c2FX8pgOtFl7KDv84rFkkrZStZv6JnspZzSfnPMXDdfGexgO8odVjsA8rOVL1RI5MDvbf_P-_AYKkJA4=)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJyVVg1v2zYQ_Ss3DUXlVJY_sKyI4wTwUg8zVjhDnLYo6kKgKdomIpEqSdn1DP_3HUnJkRcD61oghu6Ox3fvHo_cB5ppzaXQweDLPuBpMOhFQUbEqiQrFgwCWqYkiAItS0Xtd-diLuAC7mSxU3y1NhDSFvS7_V4b__wawfTj5N1kBHf3D3_dP4weJ_fT2C5wi95zyoRmKZQiZQrMmsGoIBR_Kk8EH5myaKAfdyG0AfOg8s2D1rXLspMl5GQHQhooNcM0XMOSZwzYd8oKA1wAlXmRcSIogy03a7dVlcfBgc9VErkwBOMJrijwa9mMBGKO0O2_tTHFoNPZbrcxcbBjqVadzAfrzvvJ3Xg6G7cR-nHZB5EhvaDYt5IrLHyxA1IgMkoWiDcjW5AKyEox9BlpkW8VN1ysItByabZEMZcn5doovijNCXk1Tqy_GYD0EYHEjWYwmc0D-G00m8wil-fT5PGP-w-P8Gn08DCaPk7GM7h_wGZN301sq_DrdxhNP8Ofk-m7CBhSh1ux74WyVSBUbmllqedwxtgJjKX0sHTBKF9yCrWMYCU3TAksCwqmcu4FhyBTlyfjOTfEONuL4txWnbmYi5-5oFmZMhjSksqOtktokpeZ4TkpYlqub0_DzFqV2nSoLIWJrfOFK2Ub3CXZMGqkOh_CDVMEvT4NlpDUlvPxGlvNUHfnvUYRoZGnPP43WC6xgYzkzsyFQXVyEW4kT1tzsUcSUOuWwCe2S8yuYKjHG9SLuX52bUhWMu-sXdZ5XMHywuwS-6mZLYVlTtY30O65LI31PtQbjsFVoA2l2CtjdQHapIOB5n_jQphiyGX3dbfbrcI6HRwUGKlKarDfUDfLH8petxthLOhMotfX0O5ZWbgf7WTgkNgaOg5NlbXGpGOYSuNlSO2h5BiMx4GuJYbAk5Bbm3Vrx0CWISmaKYMY3b6YVEdVQqtdPDSZJCksiZWDHQaX3VexKxcFh1WeKm5YExs1mLvFvhWuXYB0XEA_qhYfyd-_bMPhNMhl259rweFQEevlNBicCHjokxSEq_PYbsH6dDhtNfqDokMCCdTCtXW7sBjGDKecpQnWthtbicUZnHs6rhaP41UcwX6PfGIN-17Uw584jgEt_UsrBG8G91E5D80Cjgci9JzV9pw8seTFkRuiqm_Dbiv68WBfihVo2KrXeduCrfCE1bYvXyFJKjqTkJR2Grdgj6PblErAf1G75_AKwil0oN-KgB-u4dAgeeJ0hycANeg2t8dTOtWiXGyQnWBeneEpugosEynif844a3ZLyQVzara8g6Xbd6H_y-urq6tD3ay6mXotyyzFnm4Y9OuW2rungee8wOrSb91uiZHJkiMwX_Z1c12tprAZ-FzUidXVFkG3qco1o0-YjBhfF96DdubYuxovuB-BupAyu31e08Tobgvv0P8D3zFXHdbAi88Wvty5_h7xLlGQ9oIDJyY3MUGUeeLsOClr3E644YvszQ0rBDhHWb0pX0LYyHYDvkCoZo8by5jZwHCIT4FZSSle4T_BCBHikwCNLt578dbl2C4P3SesKa5v1znuGlyfzT1uDok-pKV73-BMccdDR3624vPGkMzKtd7eb-3l_Zz-4Murjp29SNCCD1D7osM3nnp-pwZiQ2mvf1n20C0L4x-xQRvh3dA3b3pvoU0UXd_oPHnbhXYbryuDfwySydJ2RvKFe9lmfNHISSnN0Ljxz1A0IOfiKThEtR_nwIkfVRccvrr__wANsNIK)) ### `static_multiset` diff --git a/benchmarks/static_multimap/count_bench.cu b/benchmarks/static_multimap/count_bench.cu index 439a8e5c8..a605ba3c7 100644 --- a/benchmarks/static_multimap/count_bench.cu +++ b/benchmarks/static_multimap/count_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,7 +57,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count( state.add_element_count(num_keys); - cuco::experimental::static_multimap map{ + cuco::static_multimap map{ size, cuco::empty_key{-1}, cuco::empty_value{-1}}; map.insert(pairs.begin(), pairs.end()); diff --git a/benchmarks/static_multimap/insert_bench.cu b/benchmarks/static_multimap/insert_bench.cu index 0c5367c58..c9b54e51e 100644 --- a/benchmarks/static_multimap/insert_bench.cu +++ b/benchmarks/static_multimap/insert_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,15 +56,15 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert( state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - cuco::experimental::static_multimap map{size, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - {}, - {}, - {}, - {}, - {}, - {launch.get_stream()}}; + cuco::static_multimap map{size, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + {}, + {}, + {}, + {}, + {}, + {launch.get_stream()}}; timer.start(); map.insert(pairs.begin(), pairs.end(), {launch.get_stream()}); diff --git a/benchmarks/static_multimap/query_bench.cu b/benchmarks/static_multimap/query_bench.cu index 82fc7c63e..74849b742 100644 --- a/benchmarks/static_multimap/query_bench.cu +++ b/benchmarks/static_multimap/query_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,11 +62,13 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query( map.insert(pairs.begin(), pairs.end()); auto const output_size = map.count(keys.begin(), keys.end()); - thrust::device_vector output(output_size); + thrust::device_vector output_probe(output_size); + thrust::device_vector output_match(output_size); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto const count = map.count(keys.begin(), keys.end(), launch.get_stream()); - map.retrieve(keys.begin(), keys.end(), output.begin(), launch.get_stream()); + auto const count = map.count(keys.begin(), keys.end(), {launch.get_stream()}); + map.retrieve( + keys.begin(), keys.end(), output_probe.begin(), output_match.begin(), {launch.get_stream()}); }); } diff --git a/benchmarks/static_multimap/retrieve_bench.cu b/benchmarks/static_multimap/retrieve_bench.cu index 75dc79bd8..594533574 100644 --- a/benchmarks/static_multimap/retrieve_bench.cu +++ b/benchmarks/static_multimap/retrieve_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,10 +62,12 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve( map.insert(pairs.begin(), pairs.end()); auto const output_size = map.count(keys.begin(), keys.end()); - thrust::device_vector output(output_size); + thrust::device_vector output_probe(output_size); + thrust::device_vector output_match(output_size); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve(keys.begin(), keys.end(), output.begin(), launch.get_stream()); + map.retrieve( + keys.begin(), keys.end(), output_probe.begin(), output_match.begin(), {launch.get_stream()}); }); } diff --git a/examples/static_multimap/host_bulk_example.cu b/examples/static_multimap/host_bulk_example.cu index 3a37bc191..c89cb306a 100644 --- a/examples/static_multimap/host_bulk_example.cu +++ b/examples/static_multimap/host_bulk_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,12 +16,13 @@ #include +#include #include #include #include #include -#include +#include int main(void) { @@ -52,23 +53,22 @@ int main(void) // Inserts all pairs into the map map.insert(pairs.begin(), pairs.end()); - // Sequence of probe keys {0, 1, 2, ... 49'999} - thrust::device_vector keys_to_find(N); + // Sequence of probe keys {0, 1, 2, ... 24'999} + // Each key should have 2 matches in the map + thrust::device_vector keys_to_find(N / 2); thrust::sequence(keys_to_find.begin(), keys_to_find.end(), 0); - // Counts the occurrences of keys in [0, 50'000) contained in the multimap. - // The `_outer` suffix indicates that the occurrence of a non-match is 1. - auto const output_size = map.count_outer(keys_to_find.begin(), keys_to_find.end()); + // Check that keys are contained in the map + thrust::device_vector contained(N / 2); + map.contains(keys_to_find.begin(), keys_to_find.end(), contained.begin()); - thrust::device_vector> d_results(output_size); + // Verify all keys are found + auto const num_found = thrust::count(contained.begin(), contained.end(), true); - // Finds all keys {0, 1, 2, ...} and stores associated key/value pairs into `d_results` - // If a key `keys_to_find[i]` doesn't exist, `d_results[i].second == empty_value_sentinel` - auto output_end = map.retrieve_outer(keys_to_find.begin(), keys_to_find.end(), d_results.begin()); - auto retrieve_size = output_end - d_results.begin(); - - // The total number of outer matches should be `N + N / 2` - assert(not(output_size == retrieve_size == N + N / 2)); + if (num_found == N / 2) { + std::cout << "Success! All " << N / 2 << " unique keys found in the multimap.\n"; + std::cout << "Each key has 2 duplicate values, for a total of " << N << " pairs.\n"; + } return 0; } diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh deleted file mode 100644 index 165b8e903..000000000 --- a/include/cuco/detail/probe_sequence_impl.cuh +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -#include - -#include - -namespace cuco::legacy::detail { - -/** - * @brief Base class of public probe sequence. This class should not be used directly. - * - * @tparam CGSize Size of CUDA Cooperative Groups - */ -template -class probe_sequence_base { - protected: - /** - * @brief Returns the size of the CUDA cooperative thread group. - */ - static constexpr std::size_t cg_size = CGSize; - - /** - * @brief Returns the number of elements loaded with each vector load. - * - * @return The number of elements loaded with each vector load - */ - static __host__ __device__ constexpr uint32_t vector_width() noexcept { return 2u; } -}; - -/** - * @brief Base class of probe sequence implementation. - * - * Hash map operations are generally memory-bandwidth bound. A vector-load loads two consecutive - * slots instead of one to fully utilize the 16B memory load supported by SASS/hardware thus - * improve memory throughput. This method (flagged by `uses_vector_load` logic) is implicitly - * applied to all hash map operations (e.g. `insert`, `count`, and `retrieve`, etc.) when pairs - * are packable (see `cuco::detail::is_packable` logic). - * - * @tparam Key Type used for keys - * @tparam Value Type of the mapped values - * @tparam Scope The scope in which multimap operations will be performed by - * individual threads - * @tparam VectorWidth Length of vector load - * @tparam CGSize Size of CUDA Cooperative Groups - */ -template -class probe_sequence_impl_base { - protected: - using value_type = cuco::pair; ///< Type of key/value pairs - using key_type = Key; ///< Key type - using mapped_type = Value; ///< Type of mapped values - using atomic_key_type = cuda::atomic; ///< Type of atomic keys - using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values - /// Pair type of atomic key and atomic mapped value - using pair_atomic_type = cuco::pair; - /// Type of the forward iterator to `pair_atomic_type` - using iterator = pair_atomic_type*; - /// Type of the forward iterator to `const pair_atomic_type` - using const_iterator = pair_atomic_type const*; - - /** - * @brief Returns the number of elements loaded with each vector-load. - */ - static constexpr uint32_t vector_width = VectorWidth; - - /** - * @brief Returns the size of the CUDA cooperative thread group. - */ - static constexpr std::size_t cg_size = CGSize; - - /** - * @brief Indicates if vector-load is used. - * - * Users have no explicit control on whether vector-load is used. - * - * @return Boolean indicating if vector-load is used. - */ - __host__ __device__ static constexpr bool uses_vector_load() noexcept - { - return cuco::detail::is_packable(); - } - - /** - * @brief Constructs a probe sequence based on the given hash map features. - * - * @param slots Pointer to beginning of the hash map slots - * @param capacity Capacity of the hash map - */ - __host__ __device__ explicit probe_sequence_impl_base(iterator slots, std::size_t capacity) - : slots_{slots}, capacity_{capacity} - { - } - - public: - /** - * @brief Returns the capacity of the hash map. - * - * @return The capacity of the hash map - */ - __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept - { - return capacity_; - } - - /** - * @brief Returns slots array. - * - * @return Slots array - */ - __device__ __forceinline__ iterator get_slots() noexcept { return slots_; } - - /** - * @brief Returns slots array. - * - * @return Slots array - */ - __device__ __forceinline__ const_iterator get_slots() const noexcept { return slots_; } - - protected: - iterator slots_; ///< Pointer to beginning of the hash map slots - const std::size_t capacity_; ///< Total number of slots -}; // class probe_sequence_impl_base - -/** - * @brief Cooperative Groups based Linear probing scheme. - * - * @tparam Key Type used for keys - * @tparam Value Type of the mapped values - * @tparam Scope The scope in which multimap operations will be performed by - * individual threads - * @tparam VectorWidth Length of vector load - * @tparam CGSize Size of CUDA Cooperative Groups - * @tparam Hash Unary callable type - */ -template -class linear_probing_impl - : public probe_sequence_impl_base { - public: - using probe_sequence_impl_base_type = - probe_sequence_impl_base; - using value_type = typename probe_sequence_impl_base_type::value_type; - using key_type = typename probe_sequence_impl_base_type::key_type; - using mapped_type = typename probe_sequence_impl_base_type::mapped_type; - using atomic_key_type = typename probe_sequence_impl_base_type::atomic_key_type; - using atomic_mapped_type = typename probe_sequence_impl_base_type::atomic_mapped_type; - using pair_atomic_type = typename probe_sequence_impl_base_type::pair_atomic_type; - using iterator = typename probe_sequence_impl_base_type::iterator; - using const_iterator = typename probe_sequence_impl_base_type::const_iterator; - - using probe_sequence_impl_base_type::capacity_; - using probe_sequence_impl_base_type::cg_size; - using probe_sequence_impl_base_type::slots_; - using probe_sequence_impl_base_type::uses_vector_load; - using probe_sequence_impl_base_type::vector_width; - - /** - * @brief Constructs a linear probing scheme based on the given hash map features. - * - * @param slots Pointer to beginning of the hash map slots - * @param capacity Capacity of the hash map - * @param hash Unary function to hash each key - */ - __host__ __device__ explicit linear_probing_impl(iterator slots, std::size_t capacity) - : probe_sequence_impl_base_type{slots, capacity}, hash_{Hash{}} - { - } - - /** - * @brief Returns the initial slot for a given key `k`. - * - * If vector-load is enabled, the return slot is always even to avoid illegal memory access. - * - * @tparam ProbeKey Probe key type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g the Cooperative Group for which the initial slot is needed - * @param k The key to get the slot for - * @return Pointer to the initial slot for `k` - */ - template - __device__ __forceinline__ iterator initial_slot( - cooperative_groups::thread_block_tile g, ProbeKey const& k) noexcept - { - return const_cast(cuda::std::as_const(*this).initial_slot(g, k)); - } - - /** - * @brief Returns the initial slot for a given key `k`. - * - * If vector-load is enabled, the return slot is always even to avoid illegal memory access. - * - * @tparam ProbeKey Probe key type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g the Cooperative Group for which the initial slot is needed - * @param k The key to get the slot for - * @return Pointer to the initial slot for `k` - */ - template - __device__ __forceinline__ const_iterator initial_slot( - cooperative_groups::thread_block_tile g, ProbeKey const& k) const noexcept - { - auto const hash_value = [&]() { - auto const tmp = hash_(k); - if constexpr (uses_vector_load()) { - // initial hash value is always even - return tmp + tmp % 2; - } - if constexpr (not uses_vector_load()) { return tmp; } - }(); - - auto const offset = [&]() { - if constexpr (uses_vector_load()) { return g.thread_rank() * vector_width; } - if constexpr (not uses_vector_load()) { return g.thread_rank(); } - }(); - - // Each CG accesses to a bucket of (`cg_size` * `vector_width`) - // slots if vector-load is used or `cg_size` slots otherwise - return &slots_[(hash_value + offset) % capacity_]; - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. - * - * @param s The slot to advance - * @return The next slot after `s` - */ - __device__ __forceinline__ iterator next_slot(iterator s) noexcept - { - return const_cast(cuda::std::as_const(*this).next_slot(s)); - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. - * - * @param s The slot to advance - * @return The next slot after `s` - */ - __device__ __forceinline__ const_iterator next_slot(const_iterator s) const noexcept - { - std::size_t index = s - slots_; - std::size_t offset; - if constexpr (uses_vector_load()) { - offset = cg_size * vector_width; - } else { - offset = cg_size; - } - return &slots_[(index + offset) % capacity_]; - } - - private: - Hash hash_; ///< The unary callable used to hash the key -}; // class linear_probing - -/** - * @brief Cooperative Groups based double hashing scheme. - * - * Default probe sequence for `cuco::static_multimap`. Double hashing shows superior - * performance when dealing with high multiplicty and/or high occupancy use cases. Performance - * hints: - * - `CGSize` = 1 or 2 when hash map is small (10'000'000 or less), 4 or 8 otherwise. - * - * `Hash1` and `Hash2` should be callable object type. - * - * @tparam Key Type used for keys - * @tparam Value Type of the mapped values - * @tparam Scope The scope in which multimap operations will be performed by - * individual threads - * @tparam VectorWidth Length of vector load - * @tparam CGSize Size of CUDA Cooperative Groups - * @tparam Hash1 Unary callable type - * @tparam Hash2 Unary callable type - */ -template -class double_hashing_impl - : public probe_sequence_impl_base { - public: - using probe_sequence_impl_base_type = - probe_sequence_impl_base; - using value_type = typename probe_sequence_impl_base_type::value_type; - using key_type = typename probe_sequence_impl_base_type::key_type; - using mapped_type = typename probe_sequence_impl_base_type::mapped_type; - using atomic_key_type = typename probe_sequence_impl_base_type::atomic_key_type; - using atomic_mapped_type = typename probe_sequence_impl_base_type::atomic_mapped_type; - using pair_atomic_type = typename probe_sequence_impl_base_type::pair_atomic_type; - using iterator = typename probe_sequence_impl_base_type::iterator; - using const_iterator = typename probe_sequence_impl_base_type::const_iterator; - - using probe_sequence_impl_base_type::capacity_; - using probe_sequence_impl_base_type::cg_size; - using probe_sequence_impl_base_type::slots_; - using probe_sequence_impl_base_type::uses_vector_load; - using probe_sequence_impl_base_type::vector_width; - - /** - * @brief Constructs a double hashing scheme based on the given hash map features. - * - * `hash2` takes a different seed to reduce the chance of secondary clustering. - * - * @param slots Pointer to beginning of the hash map slots - * @param capacity Capacity of the hash map - * @param hash1 First hasher to hash each key - * @param hash2 Second hasher to determine step size - */ - __host__ __device__ explicit double_hashing_impl(iterator slots, std::size_t capacity) - : probe_sequence_impl_base_type{slots, capacity}, - hash1_{Hash1{}}, - hash2_{Hash2{1}}, - step_size_{} - { - } - - /** - * @brief Returns the initial slot for a given key `k`. - * - * If vector-load is enabled, the return slot is always a multiple of (`cg_size` * `vector_width`) - * to avoid illegal memory access. - * - * @tparam ProbeKey Probe key type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g the Cooperative Group for which the initial slot is needed - * @param k The key to get the slot for - * @return Pointer to the initial slot for `k` - */ - template - __device__ __forceinline__ iterator initial_slot( - cooperative_groups::thread_block_tile g, ProbeKey const& k) noexcept - { - return const_cast(cuda::std::as_const(*this).initial_slot(g, k)); - } - - /** - * @brief Returns the initial slot for a given key `k`. - * - * If vector-load is enabled, the return slot is always a multiple of (`cg_size` * `vector_width`) - * to avoid illegal memory access. - * - * @tparam ProbeKey Probe key type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g the Cooperative Group for which the initial slot is needed - * @param k The key to get the slot for - * @return Pointer to the initial slot for `k` - */ - template - __device__ __forceinline__ const_iterator initial_slot( - cooperative_groups::thread_block_tile g, ProbeKey const& k) const noexcept - { - std::size_t index; - auto const hash_value = hash1_(k); - if constexpr (uses_vector_load()) { - // step size in range [1, prime - 1] * cg_size * vector_width - step_size_ = - (hash2_(k) % (capacity_ / (cg_size * vector_width) - 1) + 1) * cg_size * vector_width; - index = hash_value % (capacity_ / (cg_size * vector_width)) * cg_size * vector_width + - g.thread_rank() * vector_width; - } else { - // step size in range [1, prime - 1] * cg_size - step_size_ = (hash2_(k) % (capacity_ / cg_size - 1) + 1) * cg_size; - index = (hash_value + g.thread_rank()) % capacity_; - } - return slots_ + index; - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. - * - * @param s The slot to advance - * @return The next slot after `s` - */ - __device__ __forceinline__ iterator next_slot(iterator s) noexcept - { - return const_cast(cuda::std::as_const(*this).next_slot(s)); - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. - * - * @param s The slot to advance - * @return The next slot after `s` - */ - __device__ __forceinline__ const_iterator next_slot(const_iterator s) const noexcept - { - std::size_t index = s - slots_; - return &slots_[(index + step_size_) % capacity_]; - } - - private: - Hash1 hash1_; ///< The first unary callable used to hash the key - Hash2 hash2_; ///< The second unary callable used to determine step size - mutable std::size_t step_size_; ///< The step stride when searching for the next slot -}; // class double_hashing - -/** - * @brief Probe sequence used internally by hash map. - * - * @tparam ProbeImpl Type of probe sequence implementation - * @tparam Key Type used for keys - * @tparam Value Type of the mapped values - * @tparam Scope The scope in which multimap operations will be performed by - * individual threads - */ -template -class probe_sequence : public ProbeImpl::template impl { - public: - using impl_type = - typename ProbeImpl::template impl; ///< Type of implementation details - - /** - * @brief Constructs a probe sequence based on the given hash map features. - * - * @param slots Pointer to beginning of the hash map slots - * @param capacity Capacity of the hash map - */ - __host__ __device__ explicit probe_sequence(typename impl_type::iterator slots, - std::size_t capacity) - : impl_type{slots, capacity} - { - } -}; // class probe_sequence - -} // namespace cuco::legacy::detail diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl deleted file mode 100644 index f97f3437e..000000000 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ /dev/null @@ -1,1569 +0,0 @@ -/* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include - -#include -#include - -namespace cuco { -template -class static_multimap::device_view_impl_base { - protected: - // Import member type definitions from `static_multimap` - using value_type = value_type; - using key_type = Key; - using mapped_type = Value; - using iterator = pair_atomic_type*; - using const_iterator = pair_atomic_type const*; - using probe_sequence_type = probe_sequence_type; - - /** - * @brief Indicates if vector-load is used. - * - * Users have no explicit control on whether vector-load is used. - * - * @return Boolean indicating if vector-load is used. - */ - static constexpr bool uses_vector_load() noexcept - { - return probe_sequence_type::uses_vector_load(); - } - - /** - * @brief Returns the number of pairs loaded with each vector-load - */ - static constexpr uint32_t vector_width() noexcept { return probe_sequence_type::vector_width(); } - - __host__ __device__ device_view_impl_base(pair_atomic_type* slots, - std::size_t capacity, - Key empty_key_sentinel, - Value empty_value_sentinel) noexcept - : probe_sequence_{slots, capacity}, - empty_key_sentinel_{empty_key_sentinel}, - empty_value_sentinel_{empty_value_sentinel} - { - } - - /** - * @brief Returns the initial slot for a given key `k` - * - * To be used for Cooperative Group based probing. - * - * @tparam ProbeKey Probe key type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g the Cooperative Group for which the initial slot is needed - * @param k The key to get the slot for - * @return Pointer to the initial slot for `k` - */ - template - __device__ __forceinline__ iterator - initial_slot(cooperative_groups::thread_block_tile g, - ProbeKey const& k) noexcept - { - return probe_sequence_.initial_slot(g, k); - } - - /** - * @brief Returns the initial slot for a given key `k` - * - * To be used for Cooperative Group based probing. - * - * @tparam ProbeKey Probe key type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g the Cooperative Group for which the initial slot is needed - * @param k The key to get the slot for - * @return Pointer to the initial slot for `k` - */ - template - __device__ __forceinline__ const_iterator - initial_slot(cooperative_groups::thread_block_tile g, - ProbeKey const& k) const noexcept - { - return probe_sequence_.initial_slot(g, k); - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. To - * be used for Cooperative Group based probing. - * - * @param s The slot to advance - * @return The next slot after `s` - */ - __device__ __forceinline__ iterator next_slot(iterator s) noexcept - { - return probe_sequence_.next_slot(s); - } - - /** - * @brief Given a slot `s`, returns the next slot. - * - * If `s` is the last slot, wraps back around to the first slot. To - * be used for Cooperative Group based probing. - * - * @param s The slot to advance - * @return The next slot after `s` - */ - __device__ __forceinline__ const_iterator next_slot(const_iterator s) const noexcept - { - return probe_sequence_.next_slot(s); - } - - /** - * @brief Load two key/value pairs from the given slot to the target pair array. - * - * @param arr The pair array to be loaded - * @param current_slot The given slot to load from - */ - __device__ __forceinline__ void load_pair_array(value_type* arr, - const_iterator current_slot) const noexcept - { - if constexpr (sizeof(value_type) == 4) { - auto const tmp = *reinterpret_cast(current_slot); - memcpy(&arr[0], &tmp, 2 * sizeof(value_type)); - } else { - auto const tmp = *reinterpret_cast(current_slot); - memcpy(&arr[0], &tmp, 2 * sizeof(value_type)); - } - } - - public: - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept - { - return empty_key_sentinel_; - } - - /** - * @brief Gets the sentinel value used to represent an empty value slot. - * - * @return The sentinel value used to represent an empty value slot - */ - __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept - { - return empty_value_sentinel_; - } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ __forceinline__ pair_atomic_type* get_slots() noexcept - { - return probe_sequence_.get_slots(); - } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ __forceinline__ pair_atomic_type const* get_slots() const noexcept - { - return probe_sequence_.get_slots(); - } - - /** - * @brief Gets the maximum number of elements the hash map can hold. - * - * @return The maximum number of elements the hash map can hold - */ - __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept - { - return probe_sequence_.get_capacity(); - } - - private: - probe_sequence_type probe_sequence_; ///< Probe sequence used to probe the hash map - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Value empty_value_sentinel_{}; ///< Initial Value of empty slot -}; // class device_view_impl_base - -template -class static_multimap::device_mutable_view_impl - : public device_view_impl_base { - public: - using value_type = typename device_view_impl_base::value_type; - using key_type = typename device_view_impl_base::key_type; - using mapped_type = typename device_view_impl_base::mapped_type; - using iterator = typename device_view_impl_base::iterator; - using const_iterator = typename device_view_impl_base::const_iterator; - - private: - /** - * @brief Enumeration of the possible results of attempting to insert into a hash slot. - */ - enum class insert_result { - CONTINUE, ///< Insert did not succeed, continue trying to insert - SUCCESS, ///< New pair inserted successfully - DUPLICATE ///< Insert did not succeed, key is already present - }; - - /** - * @brief Inserts the specified key/value pair with one single CAS operation. - * - * @param current_slot The slot to insert - * @param insert_pair The pair to insert - * @param key_equal The binary callable used to compare two keys for - * equality - * @return An insert result from the `insert_resullt` enumeration. - */ - __device__ __forceinline__ insert_result packed_cas(iterator current_slot, - value_type const& insert_pair) noexcept - { - auto expected_key = this->get_empty_key_sentinel(); - auto expected_value = this->get_empty_value_sentinel(); - - cuco::detail::pair_converter expected_pair{ - cuco::make_pair(expected_key, expected_value)}; - cuco::detail::pair_converter new_pair{insert_pair}; - - auto slot = reinterpret_cast< - cuda::atomic::packed_type, Scope>*>( - current_slot); - - bool success = slot->compare_exchange_strong( - expected_pair.packed, new_pair.packed, cuda::std::memory_order_relaxed); - if (success) { return insert_result::SUCCESS; } - - return insert_result::CONTINUE; - } - - /** - * @brief Inserts the specified key/value pair with two back-to-back CAS operations. - * - * @param current_slot The slot to insert - * @param insert_pair The pair to insert - * @return An insert result from the `insert_resullt` enumeration. - */ - __device__ __forceinline__ insert_result back_to_back_cas(iterator current_slot, - value_type const& insert_pair) noexcept - { - using cuda::std::memory_order_relaxed; - - auto expected_key = this->get_empty_key_sentinel(); - auto expected_value = this->get_empty_value_sentinel(); - - // Back-to-back CAS for 8B/8B key/value pairs - auto& slot_key = current_slot->first; - auto& slot_value = current_slot->second; - - bool key_success = - slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); - bool value_success = - slot_value.compare_exchange_strong(expected_value, insert_pair.second, memory_order_relaxed); - - if (key_success) { - while (not value_success) { - value_success = - slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(), - insert_pair.second, - memory_order_relaxed); - } - return insert_result::SUCCESS; - } else if (value_success) { - slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed); - } - - return insert_result::CONTINUE; - } - - /** - * @brief Inserts the specified key/value pair with a CAS of the key and a dependent write - * of the value. - * - * @param current_slot The slot to insert - * @param insert_pair The pair to insert - * @return An insert result from the `insert_resullt` enumeration. - */ - __device__ __forceinline__ insert_result - cas_dependent_write(iterator current_slot, value_type const& insert_pair) noexcept - { - using cuda::std::memory_order_relaxed; - auto expected_key = this->get_empty_key_sentinel(); - - auto& slot_key = current_slot->first; - - auto const key_success = - slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed); - - if (key_success) { - auto& slot_value = current_slot->second; - slot_value.store(insert_pair.second, memory_order_relaxed); - return insert_result::SUCCESS; - } - - return insert_result::CONTINUE; - } - - public: - __host__ __device__ device_mutable_view_impl(pair_atomic_type* slots, - std::size_t capacity, - Key empty_key_sentinel, - Value empty_value_sentinel) noexcept - : device_view_impl_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} - { - } - - /** - * @brief Inserts the specified key/value pair into the map using vector loads. - * - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam CG Cooperative Group type - * - * @param g The Cooperative Group that performs the insert - * @param insert_pair The pair to insert - * @return void. - */ - template - __device__ __forceinline__ cuda::std::enable_if_t insert( - CG g, value_type const& insert_pair) noexcept - { - auto current_slot = initial_slot(g, insert_pair.first); - while (true) { - value_type arr[2]; - load_pair_array(&arr[0], current_slot); - - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as - // the sentinel is not a valid key value. Therefore, first check for the sentinel - auto const first_slot_is_empty = - (detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel())); - auto const second_slot_is_empty = - (detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel())); - auto const bucket_contains_empty = g.ballot(first_slot_is_empty or second_slot_is_empty); - - if (bucket_contains_empty) { - // the first lane in the group with an empty slot will attempt the insert - insert_result status{insert_result::CONTINUE}; - uint32_t src_lane = __ffs(bucket_contains_empty) - 1; - if (g.thread_rank() == src_lane) { - auto insert_location = first_slot_is_empty ? current_slot : current_slot + 1; - // One single CAS operation since vector loads are dedicated to packable pairs - status = packed_cas(insert_location, insert_pair); - } - - // successful insert - if (g.any(status == insert_result::SUCCESS)) { return; } - // if we've gotten this far, a different key took our spot - // before we could insert. We need to retry the insert on the - // same bucket - } - // if there are no empty slots in the current bucket, - // we move onto the next bucket - else { - current_slot = next_slot(current_slot); - } - } // while true - } - - /** - * @brief Inserts the specified key/value pair into the map using scalar loads. - * - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam CG Cooperative Group type - * - * @param g The Cooperative Group that performs the insert - * @param insert_pair The pair to insert - * @return void. - */ - template - __device__ __forceinline__ cuda::std::enable_if_t insert( - CG g, value_type const& insert_pair) noexcept - { - auto current_slot = this->initial_slot(g, insert_pair.first); - - while (true) { - value_type slot_contents = *reinterpret_cast(current_slot); - auto const& existing_key = slot_contents.first; - - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as - // the sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_empty = - detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); - auto const bucket_contains_empty = g.ballot(slot_is_empty); - - if (bucket_contains_empty) { - // the first lane in the group with an empty slot will attempt the insert - insert_result status{insert_result::CONTINUE}; - uint32_t src_lane = __ffs(bucket_contains_empty) - 1; - - if (g.thread_rank() == src_lane) { -#if (__CUDA_ARCH__ < 700) - status = cas_dependent_write(current_slot, insert_pair); -#else - status = back_to_back_cas(current_slot, insert_pair); -#endif - } - - // successful insert - if (g.any(status == insert_result::SUCCESS)) { return; } - // if we've gotten this far, a different key took our spot - // before we could insert. We need to retry the insert on the - // same bucket - } - // if there are no empty slots in the current bucket, - // we move onto the next bucket - else { - current_slot = this->next_slot(current_slot); - } - } // while true - } -}; // class device_mutable_view_impl - -template -class static_multimap::device_view_impl - : public device_view_impl_base { - public: - using value_type = typename device_view_impl_base::value_type; - using key_type = typename device_view_impl_base::key_type; - using mapped_type = typename device_view_impl_base::mapped_type; - using iterator = typename device_view_impl_base::iterator; - using const_iterator = typename device_view_impl_base::const_iterator; - - __host__ __device__ device_view_impl(pair_atomic_type* slots, - std::size_t capacity, - Key empty_key_sentinel, - Value empty_value_sentinel) noexcept - : device_view_impl_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} - { - } - - /** - * @brief Flushes per-CG buffer into the output sequence. - * - * A given CUDA Cooperative Group, `g`, loads `num_outputs` key-value pairs from `output_buffer` - * and writes them into global memory in a coalesced fashion. CG-wide `memcpy_sync` is used if - * `thrust::is_contiguous_iterator_v` returns true. All threads of `g` must be active - * due to implicit CG-wide synchronization during flushing. - * - * @tparam CG Cooperative Group type - * @tparam atomicT Type of atomic storage - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @param g The Cooperative Group used to flush output buffer - * @param num_outputs Number of valid output in the buffer - * @param output_buffer Buffer of the key/value pair sequence - * @param num_matches Size of the output sequence - * @param output_begin Beginning of the output sequence of key/value pairs - */ - template - __device__ __forceinline__ void flush_output_buffer(CG g, - uint32_t const num_outputs, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin) noexcept - { - std::size_t offset; - const auto lane_id = g.thread_rank(); - if (0 == lane_id) { - offset = num_matches->fetch_add(num_outputs, cuda::std::memory_order_relaxed); - } - offset = g.shfl(offset, 0); - - if constexpr (thrust::is_contiguous_iterator_v) { -#if defined(CUCO_HAS_CUDA_BARRIER) - cooperative_groups::memcpy_async( - g, - &thrust::raw_reference_cast(*(output_begin + offset)), - output_buffer, - cuda::aligned_size_t(sizeof(value_type) * num_outputs)); -#else - cooperative_groups::memcpy_async(g, - &thrust::raw_reference_cast(*(output_begin + offset)), - output_buffer, - sizeof(value_type) * num_outputs); -#endif // end CUCO_HAS_CUDA_BARRIER - } else { - for (auto index = lane_id; index < num_outputs; index += g.size()) { - cuda::std::get<0>(*(output_begin + offset + index)) = output_buffer[index].first; - cuda::std::get<1>(*(output_begin + offset + index)) = output_buffer[index].second; - } - } - } - - /** - * @brief Flushes per-CG buffer into the output sequences. - * - * A given CUDA Cooperative Group, `g`, loads `num_outputs` elements from `probe_output_buffer` - * and `num_outputs` elements from `contained_output_buffer`, then writes them into global - * memory started from `probe_output_begin` and `contained_output_begin` respectively. All - * threads of `g` must be active due to implicit CG-wide synchronization during flushing. - * - * @tparam CG Cooperative Group type - * @tparam atomicT Type of atomic storage - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @param g The Cooperative Group used to flush output buffer - * @param num_outputs Number of valid output in the buffer - * @param probe_output_buffer Buffer of the matched probe pair sequence - * @param contained_output_buffer Buffer of the matched contained pair sequence - * @param num_matches Size of the output sequence - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs - */ - template - __device__ __forceinline__ void flush_output_buffer(CG g, - uint32_t const num_outputs, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin) noexcept - { - std::size_t offset; - const auto lane_id = g.thread_rank(); - if (0 == lane_id) { - offset = num_matches->fetch_add(num_outputs, cuda::std::memory_order_relaxed); - } - offset = g.shfl(offset, 0); - - for (auto index = lane_id; index < num_outputs; index += g.size()) { - auto& probe_pair = probe_output_buffer[index]; - auto& contained_pair = contained_output_buffer[index]; - cuda::std::get<0>(*(probe_output_begin + offset + index)) = probe_pair.first; - cuda::std::get<1>(*(probe_output_begin + offset + index)) = probe_pair.second; - cuda::std::get<0>(*(contained_output_begin + offset + index)) = contained_pair.first; - cuda::std::get<1>(*(contained_output_begin + offset + index)) = contained_pair.second; - } - } - - /** - * @brief Indicates whether the probe `element` exists in the map using vector loads. - * - * If `element` was inserted into the map, `contains` returns true. Otherwise, it returns false. - * Uses the CUDA Cooperative Groups API to leverage multiple threads to perform a single - * `contains` operation. This provides a significant boost in throughput compared to the non - * Cooperative Group based `contains` at moderate to high load factors. - * - * @tparam is_pair_contains `true` if it's a `pair_contains` implementation - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam ProbeT Probe data type - * @tparam Equal Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group used to perform the contains operation - * @param element The probe element to search for - * @param equal The binary function to compare input element and slot content for equality - * @return A boolean indicating whether the key/value pair represented by `element` was inserted - */ - template - __device__ __forceinline__ cuda::std::enable_if_t contains( - cooperative_groups::thread_block_tile g, - ProbeT const& element, - Equal equal) const noexcept - { - auto current_slot = [&]() { - if constexpr (is_pair_contains) { return initial_slot(g, element.first); } - if constexpr (not is_pair_contains) { return initial_slot(g, element); } - }(); - - while (true) { - value_type arr[2]; - load_pair_array(&arr[0], current_slot); - - auto const first_slot_is_empty = - detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()); - auto const second_slot_is_empty = - detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); - auto const first_equals = [&]() { - if constexpr (is_pair_contains) { - return not first_slot_is_empty and equal(arr[0], element); - } - if constexpr (not is_pair_contains) { - return not first_slot_is_empty and equal(arr[0].first, element); - } - }(); - auto const second_equals = [&]() { - if constexpr (is_pair_contains) { - return not second_slot_is_empty and equal(arr[1], element); - } - if constexpr (not is_pair_contains) { - return not second_slot_is_empty and equal(arr[1].first, element); - } - }(); - - // the key we were searching for was found by one of the threads, so we return true - if (g.any(first_equals or second_equals)) { return true; } - - // we found an empty slot, meaning that the key we're searching for isn't present - if (g.any(first_slot_is_empty or second_slot_is_empty)) { return false; } - - // otherwise, all slots in the current bucket are full with other keys, so we move onto the - // next bucket - current_slot = next_slot(current_slot); - } - } - - /** - * @brief Indicates whether `element` exists in the map using scalar loads. - * - * If `element` was inserted into the map, `contains` returns true. Otherwise, it returns false. - * Uses the CUDA Cooperative Groups API to leverage multiple threads to perform a single - * `contains` operation. This provides a significant boost in throughput compared to the non - * Cooperative Group `contains` at moderate to high load factors. - * - * @tparam is_pair_contains `true` if it's a `pair_contains` implementation - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam ProbeT Probe data type - * @tparam Equal Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group used to perform the contains operation - * @param element The probe element to search for - * @param equal The binary function to compare input element and slot content for equality - * @return A boolean indicating whether the key/value pair represented by `element` was inserted - */ - template - __device__ __forceinline__ cuda::std::enable_if_t contains( - cooperative_groups::thread_block_tile g, - ProbeT const& element, - Equal equal) const noexcept - { - auto current_slot = [&]() { - if constexpr (is_pair_contains) { return this->initial_slot(g, element.first); } - if constexpr (not is_pair_contains) { return this->initial_slot(g, element); } - }(); - - while (true) { - value_type slot_contents = *reinterpret_cast(current_slot); - auto const& existing_key = slot_contents.first; - - // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as - // the sentinel is not a valid key value. Therefore, first check for the sentinel - auto const slot_is_empty = - detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()); - - auto const equals = [&]() { - if constexpr (is_pair_contains) { - return not slot_is_empty and equal(slot_contents, element); - } - if constexpr (not is_pair_contains) { - return not slot_is_empty and equal(existing_key, element); - } - }(); - - // the key we were searching for was found by one of the threads, so we return true - if (g.any(equals)) { return true; } - - // we found an empty slot, meaning that the key we're searching for isn't present - if (g.any(slot_is_empty)) { return false; } - - // otherwise, all slots in the current bucket are full with other keys, so we move onto the - // next bucket - current_slot = this->next_slot(current_slot); - } - } - - /** - * @brief Counts the occurrence of a given key contained in multimap using vector loads. - * - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam CG Cooperative Group type - * @tparam KeyEqual Binary callable type - * @param g The Cooperative Group used to perform the count operation - * @param k The key to search for - * @param key_equal The binary callable used to compare two keys - * for equality - * @return Number of matches found by the current thread - */ - template - __device__ __forceinline__ cuda::std::enable_if_t count( - CG g, Key const& k, KeyEqual key_equal) noexcept - { - std::size_t count = 0; - auto current_slot = initial_slot(g, k); - - [[maybe_unused]] bool found_match = false; - - while (true) { - value_type arr[2]; - load_pair_array(&arr[0], current_slot); - - auto const first_slot_is_empty = - detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()); - auto const second_slot_is_empty = - detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); - auto const first_equals = (not first_slot_is_empty and key_equal(arr[0].first, k)); - auto const second_equals = (not second_slot_is_empty and key_equal(arr[1].first, k)); - - if constexpr (is_outer) { - if (g.any(first_equals or second_equals)) { found_match = true; } - } - - count += (first_equals + second_equals); - - if (g.any(first_slot_is_empty or second_slot_is_empty)) { - if constexpr (is_outer) { - if ((not found_match) && (g.thread_rank() == 0)) { count++; } - } - return count; - } - - current_slot = next_slot(current_slot); - } - } - - /** - * @brief Counts the occurrence of a given key contained in multimap using scalar loads. - * - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam CG Cooperative Group type - * @tparam KeyEqual Binary callable type - * @param g The Cooperative Group used to perform the count operation - * @param k The key to search for - * @param key_equal The binary callable used to compare two keys - * for equality - * @return Number of matches found by the current thread - */ - template - __device__ __forceinline__ cuda::std::enable_if_t count( - CG g, Key const& k, KeyEqual key_equal) noexcept - { - std::size_t count = 0; - auto current_slot = initial_slot(g, k); - - [[maybe_unused]] bool found_match = false; - - while (true) { - value_type slot_contents = *reinterpret_cast(current_slot); - auto const& current_key = slot_contents.first; - - auto const slot_is_empty = - detail::bitwise_compare(current_key, this->get_empty_key_sentinel()); - auto const equals = not slot_is_empty and key_equal(current_key, k); - - if constexpr (is_outer) { - if (g.any(equals)) { found_match = true; } - } - - count += equals; - - if (g.any(slot_is_empty)) { - if constexpr (is_outer) { - if ((not found_match) && (g.thread_rank() == 0)) { count++; } - } - return count; - } - - current_slot = next_slot(current_slot); - } - } - - /** - * @brief Counts the occurrence of a given key/value pair contained in multimap using vector - * loads. - * - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam CG Cooperative Group type - * @tparam PairEqual Binary callable type - * @param g The Cooperative Group used to perform the pair_count operation - * @param pair The pair to search for - * @param pair_equal The binary callable used to compare two pairs - * for equality - * @return Number of matches found by the current thread - */ - template - __device__ __forceinline__ cuda::std::enable_if_t pair_count( - CG g, value_type const& pair, PairEqual pair_equal) noexcept - { - std::size_t count = 0; - auto key = pair.first; - auto current_slot = initial_slot(g, key); - - [[maybe_unused]] bool found_match = false; - - while (true) { - value_type arr[2]; - load_pair_array(&arr[0], current_slot); - - auto const first_slot_is_empty = - detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()); - auto const second_slot_is_empty = - detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); - - auto const first_slot_equals = (not first_slot_is_empty and pair_equal(arr[0], pair)); - auto const second_slot_equals = (not second_slot_is_empty and pair_equal(arr[1], pair)); - - if constexpr (is_outer) { - if (g.any(first_slot_equals or second_slot_equals)) { found_match = true; } - } - - count += (first_slot_equals + second_slot_equals); - - if (g.any(first_slot_is_empty or second_slot_is_empty)) { - if constexpr (is_outer) { - if ((not found_match) && (g.thread_rank() == 0)) { count++; } - } - return count; - } - - current_slot = next_slot(current_slot); - } - } - - /** - * @brief Counts the occurrence of a given key/value pair contained in multimap using scalar - * loads. - * - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam CG Cooperative Group type - * @tparam PairEqual Binary callable type - * @param g The Cooperative Group used to perform the pair_count operation - * @param pair The pair to search for - * @param pair_equal The binary callable used to compare two pairs - * for equality - * @return Number of matches found by the current thread - */ - template - __device__ __forceinline__ cuda::std::enable_if_t pair_count( - CG g, value_type const& pair, PairEqual pair_equal) noexcept - { - std::size_t count = 0; - auto key = pair.first; - auto current_slot = initial_slot(g, key); - - [[maybe_unused]] bool found_match = false; - - while (true) { - auto slot_contents = *reinterpret_cast(current_slot); - - auto const slot_is_empty = - detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); - - auto const equals = not slot_is_empty and pair_equal(slot_contents, pair); - - if constexpr (is_outer) { - if (g.any(equals)) { found_match = true; } - } - - count += equals; - - if (g.any(slot_is_empty)) { - if constexpr (is_outer) { - if ((not found_match) && (g.thread_rank() == 0)) { count++; } - } - return count; - } - - current_slot = next_slot(current_slot); - } - } - - /** - * @brief Retrieves all the matches of a given key contained in multimap using vector - * loads with per-flushing-CG shared memory buffer. - * - * For key `k` existing in the map, copies `k` and all associated values to unspecified - * locations in `[output_begin, output_end)`. If `k` does not have any matches, copies `k` and - * `empty_value_sentinel()` into the output only if `is_outer` is true. - * - * @tparam buffer_size Size of the output buffer - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam FlushingCG Type of Cooperative Group used to flush output buffer - * @tparam ProbingCG Type of Cooperative Group used to retrieve - * @tparam atomicT Type of atomic storage - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @tparam KeyEqual Binary callable type - * @param flushing_cg The Cooperative Group used to flush output buffer - * @param probing_cg The Cooperative Group used to retrieve - * @param k The key to search for - * @param flushing_cg_counter Pointer to the flushing cg counter - * @param output_buffer Shared memory buffer of the key/value pair sequence - * @param num_matches Size of the output sequence - * @param output_begin Beginning of the output sequence of key/value pairs - * @param key_equal The binary callable used to compare two keys - * for equality - */ - template - __device__ __forceinline__ void retrieve(FlushingCG flushing_cg, - ProbingCG probing_cg, - Key const& k, - uint32_t* flushing_cg_counter, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin, - KeyEqual key_equal) noexcept - { - const uint32_t cg_lane_id = probing_cg.thread_rank(); - - auto current_slot = initial_slot(probing_cg, k); - - bool running = true; - [[maybe_unused]] bool found_match = false; - - while (flushing_cg.any(running)) { - if (running) { - value_type arr[2]; - load_pair_array(&arr[0], current_slot); - - auto const first_slot_is_empty = - detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()); - auto const second_slot_is_empty = - detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); - auto const first_equals = (not first_slot_is_empty and key_equal(arr[0].first, k)); - auto const second_equals = (not second_slot_is_empty and key_equal(arr[1].first, k)); - auto const first_exists = probing_cg.ballot(first_equals); - auto const second_exists = probing_cg.ballot(second_equals); - - if (first_exists or second_exists) { - if constexpr (is_outer) { found_match = true; } - - auto const num_first_matches = __popc(first_exists); - auto const num_second_matches = __popc(second_exists); - - uint32_t output_idx; - if (0 == cg_lane_id) { - output_idx = atomicAdd(flushing_cg_counter, (num_first_matches + num_second_matches)); - } - output_idx = probing_cg.shfl(output_idx, 0); - - if (first_equals) { - auto const lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); - output_buffer[output_idx + lane_offset] = cuco::make_pair(k, arr[0].second); - } - if (second_equals) { - auto const lane_offset = - detail::count_least_significant_bits(second_exists, cg_lane_id); - output_buffer[output_idx + num_first_matches + lane_offset] = - cuco::make_pair(k, arr[1].second); - } - } - if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { - running = false; - if constexpr (is_outer) { - if ((not found_match) && (cg_lane_id == 0)) { - auto const output_idx = atomicAdd(flushing_cg_counter, 1); - output_buffer[output_idx] = cuco::make_pair(k, this->get_empty_value_sentinel()); - } - } - } - } // if running - - flushing_cg.sync(); - if (*flushing_cg_counter + flushing_cg.size() * vector_width() > buffer_size) { - flush_output_buffer( - flushing_cg, *flushing_cg_counter, output_buffer, num_matches, output_begin); - // Everyone in the group reads the counter when flushing, so - // sync before writing. - flushing_cg.sync(); - // First lane reset warp-level counter - if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; } - flushing_cg.sync(); - } - - current_slot = next_slot(current_slot); - } // while running - } - - /** - * @brief Retrieves all the matches of a given key contained in multimap using scalar - * loads with per-CG shared memory buffer. - * - * For key `k` existing in the map, copies `k` and all associated values to unspecified - * locations in `[output_begin, output_end)`. If `k` does not have any matches, copies `k` and - * `empty_value_sentinel()` into the output only if `is_outer` is true. - * - * @tparam buffer_size Size of the output buffer - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam CG Cooperative Group type - * @tparam atomicT Type of atomic storage - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @tparam KeyEqual Binary callable type - * @param g The Cooperative Group used to retrieve - * @param k The key to search for - * @param cg_counter Pointer to the CG counter - * @param output_buffer Shared memory buffer of the key/value pair sequence - * @param num_matches Size of the output sequence - * @param output_begin Beginning of the output sequence of key/value pairs - * @param key_equal The binary callable used to compare two keys - * for equality - */ - template - __device__ __forceinline__ void retrieve(CG g, - Key const& k, - uint32_t* cg_counter, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin, - KeyEqual key_equal) noexcept - { - const uint32_t lane_id = g.thread_rank(); - - auto current_slot = initial_slot(g, k); - - bool running = true; - [[maybe_unused]] bool found_match = false; - - while (running) { - // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation - // is unsafe! - static_assert(sizeof(Key) == sizeof(cuda::atomic)); - static_assert(sizeof(Value) == sizeof(cuda::atomic)); - value_type slot_contents = *reinterpret_cast(current_slot); - - auto const slot_is_empty = - detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); - auto const equals = (not slot_is_empty and key_equal(slot_contents.first, k)); - auto const exists = g.ballot(equals); - - uint32_t output_idx = *cg_counter; - - if (exists) { - if constexpr (is_outer) { found_match = true; } - auto const num_matches = __popc(exists); - if (equals) { - // Each match computes its lane-level offset - auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); - output_buffer[output_idx + lane_offset] = cuco::make_pair(k, slot_contents.second); - } - if (0 == lane_id) { (*cg_counter) += num_matches; } - } - if (g.any(slot_is_empty)) { - running = false; - if constexpr (is_outer) { - if ((not found_match) && (lane_id == 0)) { - output_idx = (*cg_counter)++; - output_buffer[output_idx] = cuco::make_pair(k, this->get_empty_value_sentinel()); - } - } - } - - g.sync(); - - // Flush if the next iteration won't fit into buffer - if ((*cg_counter + g.size()) > buffer_size) { - flush_output_buffer(g, *cg_counter, output_buffer, num_matches, output_begin); - // Everyone in the group reads the counter when flushing, so - // sync before writing. - g.sync(); - // First lane reset CG-level counter - if (lane_id == 0) { *cg_counter = 0; } - g.sync(); - } - current_slot = next_slot(current_slot); - } // while running - } - - /** - * @brief Retrieves all the matches of a given pair using vector loads. - * - * For pair `p` with `n` matching pairs, if `pair_equal(p, slot)` returns true, stores - * `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, `contained_key_begin[j] = - * slot.first`, and `contained_val_begin[j] = slot.second` for an unspecified value of `j` where - * `0 <= j < n`. If `p` does not have any matches, stores `probe_key_begin[0] = p.first`, - * `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, and - * `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. - * - * Concurrent reads or writes to any of the output ranges results in undefined behavior. - * - * Behavior is undefined if the extent of any of the output ranges is less than `n`. - * - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam ProbingCG Type of Cooperative Group used to retrieve - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `pair`'s `Key` type. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * `pair`'s `Value` type. - * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from - * the map's `key_type`. - * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from - * the map's `mapped_type`. - * @tparam PairEqual Binary callable type - * @param probing_cg The Cooperative Group used to retrieve - * @param pair The pair to search for - * @param probe_key_begin Beginning of the output sequence of the matched probe keys - * @param probe_val_begin Beginning of the output sequence of the matched probe values - * @param contained_key_begin Beginning of the output sequence of the matched contained keys - * @param contained_val_begin Beginning of the output sequence of the matched contained values - * @param pair_equal The binary callable used to compare two pairs for equality - */ - template - __device__ __forceinline__ cuda::std::enable_if_t pair_retrieve( - ProbingCG probing_cg, - value_type const& pair, - OutputIt1 probe_key_begin, - OutputIt2 probe_val_begin, - OutputIt3 contained_key_begin, - OutputIt4 contained_val_begin, - PairEqual pair_equal) noexcept - { - auto const lane_id = probing_cg.thread_rank(); - auto current_slot = initial_slot(probing_cg, pair.first); - [[maybe_unused]] auto found_match = false; - - auto num_matches = 0; - - while (true) { - value_type arr[2]; - load_pair_array(&arr[0], current_slot); - - auto const first_slot_is_empty = - detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()); - auto const second_slot_is_empty = - detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); - auto const first_equals = (not first_slot_is_empty and pair_equal(arr[0], pair)); - auto const second_equals = (not second_slot_is_empty and pair_equal(arr[1], pair)); - auto const first_exists = probing_cg.ballot(first_equals); - auto const second_exists = probing_cg.ballot(second_equals); - - if (first_exists or second_exists) { - if constexpr (is_outer) { found_match = true; } - - auto const num_first_matches = __popc(first_exists); - - if (first_equals) { - auto lane_offset = detail::count_least_significant_bits(first_exists, lane_id); - auto const output_idx = num_matches + lane_offset; - - *(probe_key_begin + output_idx) = pair.first; - *(probe_val_begin + output_idx) = pair.second; - *(contained_key_begin + output_idx) = arr[0].first; - *(contained_val_begin + output_idx) = arr[0].second; - } - if (second_equals) { - auto const lane_offset = detail::count_least_significant_bits(second_exists, lane_id); - auto const output_idx = num_matches + num_first_matches + lane_offset; - - *(probe_key_begin + output_idx) = pair.first; - *(probe_val_begin + output_idx) = pair.second; - *(contained_key_begin + output_idx) = arr[1].first; - *(contained_val_begin + output_idx) = arr[1].second; - } - num_matches += (num_first_matches + __popc(second_exists)); - } - if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { - if constexpr (is_outer) { - if ((not found_match) and lane_id == 0) { - *(probe_key_begin) = pair.first; - *(probe_val_begin) = pair.second; - *(contained_key_begin) = this->get_empty_key_sentinel(); - *(contained_val_begin) = this->get_empty_value_sentinel(); - } - } - return; // exit if any slot in the current bucket is empty - } - - current_slot = next_slot(current_slot); - } // while - } - - /** - * @brief Retrieves all the matches of a given pair using scalar loads. - * - * For pair `p` with `n` matching pairs, if `pair_equal(p, slot)` returns true, stores - * `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, `contained_key_begin[j] = - * slot.first`, and `contained_val_begin[j] = slot.second` for an unspecified value of `j` where - * `0 <= j < n`. If `p` does not have any matches, stores `probe_key_begin[0] = p.first`, - * `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, and - * `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true. - * - * Concurrent reads or writes to any of the output ranges results in undefined behavior. - * - * Behavior is undefined if the extent of any of the output ranges is less than `n`. - * - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used - * @tparam ProbingCG Type of Cooperative Group used to retrieve - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `pair`'s `Key` type. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * `pair`'s `Value` type. - * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from - * the map's `key_type`. - * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from - * the map's `mapped_type`. - * @tparam PairEqual Binary callable type - * @param probing_cg The Cooperative Group used to retrieve - * @param pair The pair to search for - * @param probe_key_begin Beginning of the output sequence of the matched probe keys - * @param probe_val_begin Beginning of the output sequence of the matched probe values - * @param contained_key_begin Beginning of the output sequence of the matched contained keys - * @param contained_val_begin Beginning of the output sequence of the matched contained values - * @param pair_equal The binary callable used to compare two pairs for equality - */ - template - __device__ __forceinline__ cuda::std::enable_if_t pair_retrieve( - ProbingCG probing_cg, - value_type const& pair, - OutputIt1 probe_key_begin, - OutputIt2 probe_val_begin, - OutputIt3 contained_key_begin, - OutputIt4 contained_val_begin, - PairEqual pair_equal) noexcept - { - auto const lane_id = probing_cg.thread_rank(); - auto current_slot = initial_slot(probing_cg, pair.first); - [[maybe_unused]] auto found_match = false; - - auto num_matches = 0; - - while (true) { - // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation - // is unsafe! - static_assert(sizeof(Key) == sizeof(cuda::atomic)); - static_assert(sizeof(Value) == sizeof(cuda::atomic)); - value_type slot_contents = *reinterpret_cast(current_slot); - - auto const slot_is_empty = - detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); - auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); - auto const exists = probing_cg.ballot(equals); - - if (exists) { - if constexpr (is_outer) { found_match = true; } - - if (equals) { - auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); - auto const output_idx = num_matches + lane_offset; - - *(probe_key_begin + output_idx) = pair.first; - *(probe_val_begin + output_idx) = pair.second; - *(contained_key_begin + output_idx) = slot_contents.first; - *(contained_val_begin + output_idx) = slot_contents.second; - } - num_matches += __popc(exists); - } - if (probing_cg.any(slot_is_empty)) { - if constexpr (is_outer) { - if ((not found_match) and lane_id == 0) { - *(probe_key_begin) = pair.first; - *(probe_val_begin) = pair.second; - *(contained_key_begin) = this->get_empty_key_sentinel(); - *(contained_val_begin) = this->get_empty_value_sentinel(); - } - } - return; // exit if any slot in the current bucket is empty - } - - current_slot = next_slot(current_slot); - } // while - } - - /** - * @brief Retrieves all the matches of a given pair contained in multimap using vector - * loads with per-flushing-CG shared memory buffer. - * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations - * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies - * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only if - * `is_outer` is true. - * - * @tparam buffer_size Size of the output buffer - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam FlushingCG Type of Cooperative Group used to flush output buffer - * @tparam ProbingCG Type of Cooperative Group used to retrieve - * @tparam atomicT Type of atomic storage - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @tparam PairEqual Binary callable type - * @param flushing_cg The Cooperative Group used to flush output buffer - * @param probing_cg The Cooperative Group used to retrieve - * @param pair The pair to search for - * @param flushing_cg_counter Pointer to the flushing CG counter - * @param probe_output_buffer Buffer of the matched probe pair sequence - * @param contained_output_buffer Buffer of the matched contained pair sequence - * @param num_matches Size of the output sequence - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs - * @param pair_equal The binary callable used to compare two pairs for equality - */ - template - __device__ __forceinline__ void pair_retrieve(FlushingCG flushing_cg, - ProbingCG probing_cg, - value_type const& pair, - uint32_t* flushing_cg_counter, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal) noexcept - { - const uint32_t cg_lane_id = probing_cg.thread_rank(); - - auto key = pair.first; - auto current_slot = initial_slot(probing_cg, key); - - bool running = true; - [[maybe_unused]] bool found_match = false; - - while (flushing_cg.any(running)) { - if (running) { - value_type arr[2]; - load_pair_array(&arr[0], current_slot); - - auto const first_slot_is_empty = - detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()); - auto const second_slot_is_empty = - detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()); - auto const first_equals = (not first_slot_is_empty and pair_equal(arr[0], pair)); - auto const second_equals = (not second_slot_is_empty and pair_equal(arr[1], pair)); - auto const first_exists = probing_cg.ballot(first_equals); - auto const second_exists = probing_cg.ballot(second_equals); - - if (first_exists or second_exists) { - if constexpr (is_outer) { found_match = true; } - - auto const num_first_matches = __popc(first_exists); - auto const num_second_matches = __popc(second_exists); - - uint32_t output_idx; - if (0 == cg_lane_id) { - output_idx = atomicAdd(flushing_cg_counter, (num_first_matches + num_second_matches)); - } - output_idx = probing_cg.shfl(output_idx, 0); - - if (first_equals) { - auto const lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id); - probe_output_buffer[output_idx + lane_offset] = pair; - contained_output_buffer[output_idx + lane_offset] = arr[0]; - } - if (second_equals) { - auto const lane_offset = - detail::count_least_significant_bits(second_exists, cg_lane_id); - probe_output_buffer[output_idx + num_first_matches + lane_offset] = pair; - contained_output_buffer[output_idx + num_first_matches + lane_offset] = arr[1]; - } - } - if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) { - running = false; - if constexpr (is_outer) { - if ((not found_match) && (cg_lane_id == 0)) { - auto const output_idx = atomicAdd(flushing_cg_counter, 1); - probe_output_buffer[output_idx] = pair; - contained_output_buffer[output_idx] = - cuco::make_pair(this->get_empty_key_sentinel(), this->get_empty_value_sentinel()); - } - } - } - } // if running - - flushing_cg.sync(); - if (*flushing_cg_counter + flushing_cg.size() * vector_width() > buffer_size) { - flush_output_buffer(flushing_cg, - *flushing_cg_counter, - probe_output_buffer, - contained_output_buffer, - num_matches, - probe_output_begin, - contained_output_begin); - // Everyone in the group reads the counter when flushing, so - // sync before writing. - flushing_cg.sync(); - // First lane reset warp-level counter - if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; } - flushing_cg.sync(); - } - - current_slot = next_slot(current_slot); - } // while running - } - - /** - * @brief Retrieves all the matches of a given pair contained in multimap using scalar - * loads with per-CG shared memory buffer. - * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations - * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies - * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only if - * `is_outer` is true. - * - * @tparam buffer_size Size of the output buffer - * @tparam is_outer Boolean flag indicating whether outer join is peformed - * @tparam CG Cooperative Group type - * @tparam atomicT Type of atomic storage - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @tparam PairEqual Binary callable type - * @param g The Cooperative Group used to retrieve - * @param pair The pair to search for - * @param cg_counter Pointer to the CG counter - * @param probe_output_buffer Buffer of the matched probe pair sequence - * @param contained_output_buffer Buffer of the matched contained pair sequence - * @param num_matches Size of the output sequence - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs - * @param pair_equal The binary callable used to compare two pairs for equality - */ - template - __device__ __forceinline__ void pair_retrieve(CG g, - value_type const& pair, - uint32_t* cg_counter, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal) noexcept - { - const uint32_t lane_id = g.thread_rank(); - - auto key = pair.first; - auto current_slot = initial_slot(g, key); - - bool running = true; - [[maybe_unused]] bool found_match = false; - - while (running) { - // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation - // is unsafe! - static_assert(sizeof(Key) == sizeof(cuda::atomic)); - static_assert(sizeof(Value) == sizeof(cuda::atomic)); - value_type slot_contents = *reinterpret_cast(current_slot); - - auto const slot_is_empty = - detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel()); - auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair)); - auto const exists = g.ballot(equals); - - uint32_t output_idx = *cg_counter; - - if (exists) { - if constexpr (is_outer) { found_match = true; } - auto const num_matches = __popc(exists); - if (equals) { - // Each match computes its lane-level offset - auto const lane_offset = detail::count_least_significant_bits(exists, lane_id); - probe_output_buffer[output_idx + lane_offset] = pair; - contained_output_buffer[output_idx + lane_offset] = slot_contents; - } - if (0 == lane_id) { (*cg_counter) += num_matches; } - } - if (g.any(slot_is_empty)) { - running = false; - if constexpr (is_outer) { - if ((not found_match) && (lane_id == 0)) { - output_idx = (*cg_counter)++; - probe_output_buffer[output_idx] = pair; - contained_output_buffer[output_idx] = - cuco::make_pair(this->get_empty_key_sentinel(), this->get_empty_value_sentinel()); - } - } - } - - g.sync(); - - // Flush if the next iteration won't fit into buffer - if ((*cg_counter + g.size()) > buffer_size) { - flush_output_buffer(g, - *cg_counter, - probe_output_buffer, - contained_output_buffer, - num_matches, - probe_output_begin, - contained_output_begin); - // Everyone in the group reads the counter when flushing, so - // sync before writing. - g.sync(); - // First lane reset CG-level counter - if (lane_id == 0) { *cg_counter = 0; } - g.sync(); - } - current_slot = next_slot(current_slot); - } // while running - } -}; // class device_view_impl - -} // namespace cuco diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh deleted file mode 100644 index 1e7b9d985..000000000 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ /dev/null @@ -1,554 +0,0 @@ -/* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include -#include -#include - -namespace cuco { -namespace detail { -namespace cg = cooperative_groups; - -CUCO_SUPPRESS_KERNEL_WARNINGS -/** - * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. - * - * Each space in `slots` that can hold a key value pair is initialized to a - * `pair_atomic_type` containing the key `k` and the value `v`. - * - * @tparam atomic_key_type Type of the `Key` atomic container - * @tparam atomic_mapped_type Type of the `Value` atomic container - * @tparam Key key type - * @tparam Value value type - * @tparam pair_atomic_type key/value pair type - * - * @param slots Pointer to flat storage for the map's key/value pairs - * @param k Key to which all keys in `slots` are initialized - * @param v Value to which all values in `slots` are initialized - * @param size Size of the storage pointed to by `slots` - */ -template -CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) -{ - int64_t const loop_stride = gridDim.x * blockDim.x; - int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; - while (idx < size) { - new (&slots[idx].first) atomic_key_type{k}; - new (&slots[idx].second) atomic_mapped_type{v}; - idx += loop_stride; - } -} - -/** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * - * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform each - * key/value insertion. This provides a significant boost in throughput compared to the non - * Cooperative Group `insert` at moderate to high load factors. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform - * inserts - * @tparam InputIt Device accessible random access input iterator where - * `std::is_convertible::value_type, - * static_multimap::value_type>` is `true` - * @tparam viewT Type of device view allowing access of hash map storage - * - * @param first Beginning of the sequence of key/value pairs - * @param n Number of key/value pairs to insert - * @param view Mutable device view used to access the hash map's slot storage - */ -template -CUCO_KERNEL void insert(InputIt first, int64_t n, viewT view) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - - while (idx < n) { - // force conversion to value_type - typename viewT::value_type const insert_pair{*(first + idx)}; - view.insert(tile, insert_pair); - idx += loop_stride; - } -} - -/** - * @brief Inserts key/value pairs in the range `[first, first + n)` if `pred` of the - * corresponding stencil returns true. - * - * The key/value pair `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. - * - * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform each - * key/value insertion. This provides a significant boost in throughput compared to the non - * Cooperative Group `insert` at moderate to high load factors. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform - * inserts - * @tparam InputIt Device accessible random access input iterator where - * `std::is_convertible::value_type, - * static_multimap::value_type>` is `true` - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and - * argument type is convertible from `std::iterator_traits::value_type`. - * - * @param first Beginning of the sequence of key/value pairs - * @param s Beginning of the stencil sequence - * @param n Number of elements to insert - * @param view Mutable device view used to access the hash map's slot storage - * @param pred Predicate to test on every element in the range `[s, s + n)` - */ -template -CUCO_KERNEL void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - - while (idx < n) { - if (pred(*(s + idx))) { - typename viewT::value_type const insert_pair{*(first + idx)}; - // force conversion to value_type - view.insert(tile, insert_pair); - } - idx += loop_stride; - } -} - -/** - * @brief Indicates whether the elements in the range `[first, last)` are contained in the map. - * - * Stores `true` or `false` to `(output + i)` indicating if the element `*(first + i)` exists in the - * map. - * - * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the - * contains operation for each element. This provides a significant boost in throughput compared - * to the non Cooperative Group `contains` at moderate to high load factors. - * - * @tparam is_pair_contains `true` if it's a `pair_contains` implementation - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups - * @tparam InputIt Device accessible input iterator - * @tparam OutputIt Device accessible output iterator assignable from `bool` - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam Equal Binary callable type - * - * @param first Beginning of the sequence of elements - * @param n Number of elements to query - * @param output_begin Beginning of the sequence of booleans for the presence of each element - * @param view Device view used to access the hash map's slot storage - * @param equal The binary function to compare input element and slot content for equality - */ -template -CUCO_KERNEL void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - __shared__ bool writeBuffer[block_size / tile_size]; - - while (idx < n) { - typename cuda::std::iterator_traits::value_type element = *(first + idx); - auto found = [&]() { - if constexpr (is_pair_contains) { return view.pair_contains(tile, element, equal); } - if constexpr (not is_pair_contains) { return view.contains(tile, element, equal); } - }(); - - /* - * The ld.relaxed.gpu instruction used in view.find causes L1 to - * flush more frequently, causing increased sector stores from L2 to global memory. - * By writing results to shared memory and then synchronizing before writing back - * to global, we no longer rely on L1, preventing the increase in sector stores from - * L2 to global and improving performance. - */ - if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } - __syncthreads(); - if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } - idx += loop_stride; - } -} - -/** - * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap. - * - * For each key, `k = *(first + i)`, counts all matching keys, `k'`, as determined by `key_equal(k, - * k')` and stores the sum of all matches for all keys to `num_matches`. If `k` does not have any - * matches, it contributes 1 to the final sum only if `is_outer` is true. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform counts - * @tparam uses_vector_load Boolean flag indicating whether vector loads are used or not - * @tparam is_outer Boolean flag indicating whether non-matches are counted - * @tparam InputIt Device accessible input iterator whose `value_type` is convertible to the map's - * `key_type` - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam KeyEqual Binary callable - * - * @param first Beginning of the sequence of keys to count - * @param n Number of the keys to query - * @param num_matches The number of all the matches for a sequence of keys - * @param view Device view used to access the hash map's slot storage - * @param key_equal Binary function to compare two keys for equality - */ -template -CUCO_KERNEL void count( - InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_matches = 0; - - while (idx < n) { - auto key = *(first + idx); - if constexpr (is_outer) { - thread_num_matches += view.count_outer(tile, key, key_equal); - } else { - thread_num_matches += view.count(tile, key, key_equal); - } - idx += loop_stride; - } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_matches = BlockReduce(temp_storage).Sum(thread_num_matches); - if (threadIdx.x == 0) { - num_matches->fetch_add(block_num_matches, cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Counts the occurrences of key/value pairs in `[first, last)` contained in the multimap. - * - * For pair, `p = *(first + i)`, counts all matching pairs, `p'`, as determined by `pair_equal(p, - * p')` and stores the sum of all matches for all pairs to `num_matches`. If `p` does not have any - * matches, it contributes 1 to the final sum only if `is_outer` is true. - * - * @tparam block_size The size of the thread block - * @tparam tile_size The number of threads in the Cooperative Groups used to perform counts - * @tparam is_outer Boolean flag indicating whether non-matches are counted - * @tparam InputIt Device accessible random access input iterator where - * `std::is_convertible::value_type, - * static_multimap::value_type>` is `true` - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam PairEqual Binary callable - * - * @param first Beginning of the sequence of pairs to count - * @param n Number of the pairs to query - * @param num_matches The number of all the matches for a sequence of pairs - * @param view Device view used to access the hash map's slot storage - * @param pair_equal Binary function to compare two pairs for equality - */ -template -CUCO_KERNEL void pair_count( - InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal) -{ - auto tile = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / tile_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t thread_num_matches = 0; - - while (idx < n) { - typename viewT::value_type const pair = *(first + idx); - if constexpr (is_outer) { - thread_num_matches += view.pair_count_outer(tile, pair, pair_equal); - } else { - thread_num_matches += view.pair_count(tile, pair, pair_equal); - } - idx += loop_stride; - } - - // compute number of successfully inserted elements for each block - // and atomically add to the grand total - std::size_t block_num_matches = BlockReduce(temp_storage).Sum(thread_num_matches); - if (threadIdx.x == 0) { - num_matches->fetch_add(block_num_matches, cuda::std::memory_order_relaxed); - } -} - -/** - * @brief Retrieves all the values corresponding to all keys in the range `[first, last)`. - * - * For key `k = *(first + i)` existing in the map, copies `k` and all associated values to - * unspecified locations in `[output_begin, output_end)`. If `k` does not have any matches, copies - * `k` and `empty_value_sentinel()` into the output only if `is_outer` is true. - * - * Behavior is undefined if the total number of matching keys exceeds `std::distance(output_begin, - * output_begin + *num_matches - 1)`. Use `count()` to determine the size of the output range. - * - * @tparam block_size The size of the thread block - * @tparam flushing_cg_size The size of the CG used to flush output buffers - * @tparam probing_cg_size The size of the CG for parallel retrievals - * @tparam buffer_size Size of the output buffer - * @tparam is_outer Boolean flag indicating whether non-matches are included in the output - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam KeyEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param n Number of the keys to query - * @param output_begin Beginning of the sequence of values retrieved for each key - * @param num_matches Size of the output sequence - * @param view Device view used to access the hash map's slot storage - * @param key_equal The binary function to compare two keys for equality - */ -template -CUCO_KERNEL void retrieve(InputIt first, - int64_t n, - OutputIt output_begin, - atomicT* num_matches, - viewT view, - KeyEqual key_equal) -{ - using pair_type = typename viewT::value_type; - - constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; - const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - - auto flushing_cg = - cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; - - __shared__ pair_type output_buffer[num_flushing_cgs][buffer_size]; - // TODO: replace this with shared memory cuda::atomic variables once the dynamiic initialization - // warning issue is solved __shared__ atomicT counter[num_flushing_cgs][buffer_size]; - __shared__ uint32_t flushing_cg_counter[num_flushing_cgs]; - - if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } - - flushing_cg.sync(); - - while (flushing_cg.any(idx < n)) { - bool active_flag = idx < n; - auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); - - if (active_flag) { - auto key = *(first + idx); - if constexpr (is_outer) { - view.template retrieve_outer(active_flushing_cg, - probing_cg, - key, - &flushing_cg_counter[flushing_cg_id], - output_buffer[flushing_cg_id], - num_matches, - output_begin, - key_equal); - } else { - view.template retrieve(active_flushing_cg, - probing_cg, - key, - &flushing_cg_counter[flushing_cg_id], - output_buffer[flushing_cg_id], - num_matches, - output_begin, - key_equal); - } - } - idx += loop_stride; - } - - flushing_cg.sync(); - // Final flush of output buffer - if (flushing_cg_counter[flushing_cg_id] > 0) { - view.flush_output_buffer(flushing_cg, - flushing_cg_counter[flushing_cg_id], - output_buffer[flushing_cg_id], - num_matches, - output_begin); - } -} - -/** - * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`. - * - * If pair_equal(*(first + i), slot[j]) returns true, then *(first+i) is stored to unspecified - * locations in `probe_output_begin`, and slot[j] is stored to unspecified locations in - * `contained_output_begin`. If the given pair has no matches in the map, copies *(first + i) in - * `probe_output_begin` and a pair of `empty_key_sentinel` and `empty_value_sentinel` in - * `contained_output_begin` only when `is_outer` is `true`. - * - * Behavior is undefined if the total number of matching pairs exceeds `std::distance(output_begin, - * output_begin + *num_matches - 1)`. Use `pair_count()` to determine the size of the output range. - * - * @tparam block_size The size of the thread block - * @tparam flushing_cg_size The size of the CG used to flush output buffers - * @tparam probing_cg_size The size of the CG for parallel retrievals - * @tparam buffer_size Size of the output buffer - * @tparam is_outer Boolean flag indicating whether non-matches are included in the output - * @tparam InputIt Device accessible random access input iterator where - * `std::is_convertible::value_type, - * static_multimap::value_type>` is `true` - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @tparam atomicT Type of atomic storage - * @tparam viewT Type of device view allowing access of hash map storage - * @tparam PairEqual Binary callable type - * - * @param first Beginning of the sequence of keys - * @param n Number of keys to query - * @param probe_output_begin Beginning of the sequence of the matched probe pairs - * @param contained_output_begin Beginning of the sequence of the matched contained pairs - * @param num_matches Size of the output sequence - * @param view Device view used to access the hash map's slot storage - * @param pair_equal The binary function to compare two pairs for equality - */ -template -CUCO_KERNEL void pair_retrieve(InputIt first, - int64_t n, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - atomicT* num_matches, - viewT view, - PairEqual pair_equal) -{ - using pair_type = typename viewT::value_type; - - constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; - const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - - auto flushing_cg = - cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; - int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; - - __shared__ pair_type probe_output_buffer[num_flushing_cgs][buffer_size]; - __shared__ pair_type contained_output_buffer[num_flushing_cgs][buffer_size]; - // TODO: replace this with shared memory cuda::atomic variables once the dynamiic initialization - // warning issue is solved __shared__ atomicT counter[num_flushing_cgs][buffer_size]; - __shared__ uint32_t flushing_cg_counter[num_flushing_cgs]; - - if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } - - flushing_cg.sync(); - - while (flushing_cg.any(idx < n)) { - bool active_flag = idx < n; - auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); - - if (active_flag) { - pair_type pair = *(first + idx); - if constexpr (is_outer) { - view.pair_retrieve_outer(active_flushing_cg, - probing_cg, - pair, - &flushing_cg_counter[flushing_cg_id], - probe_output_buffer[flushing_cg_id], - contained_output_buffer[flushing_cg_id], - num_matches, - probe_output_begin, - contained_output_begin, - pair_equal); - } else { - view.pair_retrieve(active_flushing_cg, - probing_cg, - pair, - &flushing_cg_counter[flushing_cg_id], - probe_output_buffer[flushing_cg_id], - contained_output_buffer[flushing_cg_id], - num_matches, - probe_output_begin, - contained_output_begin, - pair_equal); - } - } - idx += loop_stride; - } - - flushing_cg.sync(); - // Final flush of output buffer - if (flushing_cg_counter[flushing_cg_id] > 0) { - view.flush_output_buffer(flushing_cg, - flushing_cg_counter[flushing_cg_id], - probe_output_buffer[flushing_cg_id], - contained_output_buffer[flushing_cg_id], - num_matches, - probe_output_begin, - contained_output_begin); - } -} -} // namespace detail -} // namespace cuco diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index 39e23f671..f0b3d4330 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ * limitations under the License. */ +#include #include #include -#include #include #include @@ -26,7 +26,6 @@ #include namespace cuco { -namespace experimental { template {}, impl_->storage_ref()}; } -} // namespace experimental - -template -static_multimap::static_multimap( - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - cudaStream_t stream, - Allocator const& alloc) - : capacity_{cuco::detail::get_valid_capacity( - capacity)}, - empty_key_sentinel_{empty_key_sentinel.value}, - empty_value_sentinel_{empty_value_sentinel.value}, - allocator_{alloc}, - delete_slots_{allocator_, capacity_, cuda::stream_ref{stream}}, - slots_{allocator_.allocate(capacity_, cuda::stream_ref{stream}), delete_slots_} -{ - auto constexpr block_size = 128; - auto constexpr stride = 4; - auto const grid_size = (get_capacity() + stride * block_size - 1) / (stride * block_size); - - detail::initialize<<>>( - slots_.get(), empty_key_sentinel_, empty_value_sentinel_, get_capacity()); -} - -template -template -void static_multimap::insert(InputIt first, - InputIt last, - cudaStream_t stream) -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_mutable_view(); - - detail::insert - <<>>(first, num_keys, view); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); -} - -template -template -void static_multimap::insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream) -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_mutable_view(); - - detail::insert_if_n - <<>>(first, stencil, num_keys, view, pred); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); -} - -template -template -void static_multimap::contains( - InputIt first, InputIt last, OutputIt output_begin, KeyEqual key_equal, cudaStream_t stream) const -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr is_pair_contains = false; - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - auto view = get_device_view(); - - detail::contains - <<>>(first, num_keys, output_begin, view, key_equal); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); -} - -template -template -void static_multimap::pair_contains( - InputIt first, InputIt last, OutputIt output_begin, PairEqual pair_equal, cudaStream_t stream) - const -{ - auto const num_pairs = cuco::detail::distance(first, last); - if (num_pairs == 0) { return; } - - auto constexpr is_pair_contains = true; - auto constexpr block_size = 128; - auto constexpr stride = 1; - auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - auto view = get_device_view(); - - detail::contains - <<>>(first, num_pairs, output_begin, view, pair_equal); - CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); -} - -template -template -std::size_t static_multimap::count( - InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return 0; } - - auto constexpr is_outer = false; - auto constexpr block_size = 128; - auto constexpr stride = 1; - - auto view = get_device_view(); - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - - auto counter = detail::counter_storage{allocator_, stream}; - counter.reset(stream); - - detail::count - <<>>(first, num_keys, counter.data(), view, key_equal); - - return counter.load_to_host(stream); -} - -template -template -std::size_t static_multimap::count_outer( - InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return 0; } - - auto constexpr is_outer = true; - auto constexpr block_size = 128; - auto constexpr stride = 1; - - auto view = get_device_view(); - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - - auto counter = detail::counter_storage{allocator_, stream}; - counter.reset(stream); - - detail::count - <<>>(first, num_keys, counter.data(), view, key_equal); - - return counter.load_to_host(stream); -} - -template -template -std::size_t static_multimap::pair_count( - InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const -{ - auto const num_pairs = cuco::detail::distance(first, last); - if (num_pairs == 0) { return 0; } - - auto constexpr is_outer = false; - auto constexpr block_size = 128; - auto constexpr stride = 1; - - auto view = get_device_view(); - auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - - auto counter = detail::counter_storage{allocator_, stream}; - counter.reset(stream); - - detail::pair_count - <<>>(first, num_pairs, counter.data(), view, pair_equal); - - return counter.load_to_host(stream); -} - -template -template -std::size_t static_multimap::pair_count_outer( - InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const -{ - auto const num_pairs = cuco::detail::distance(first, last); - if (num_pairs == 0) { return 0; } - - auto constexpr is_outer = true; - auto constexpr block_size = 128; - auto constexpr stride = 1; - - auto view = get_device_view(); - auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - - auto counter = detail::counter_storage{allocator_, stream}; - counter.reset(stream); - - detail::pair_count - <<>>(first, num_pairs, counter.data(), view, pair_equal); - - return counter.load_to_host(stream); -} - -template -template -OutputIt static_multimap::retrieve( - InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return output_begin; } - - // Using per-warp buffer for vector loads and per-CG buffer for scalar loads - constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u); - constexpr auto is_outer = false; - - auto view = get_device_view(); - auto const flushing_cg_size = [&]() { - if constexpr (uses_vector_load()) { return warp_size(); } - return cg_size(); - }(); - - auto const grid_size = detail::grid_size(num_keys, cg_size()); - - auto counter = detail::counter_storage{allocator_, stream}; - counter.reset(stream); - - detail::retrieve - <<>>( - first, num_keys, output_begin, counter.data(), view, key_equal); - - return output_begin + counter.load_to_host(stream); -} - -template -template -OutputIt static_multimap::retrieve_outer( - InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const -{ - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return output_begin; } - - // Using per-warp buffer for vector loads and per-CG buffer for scalar loads - constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u); - constexpr auto is_outer = true; - - auto view = get_device_view(); - auto const flushing_cg_size = [&]() { - if constexpr (uses_vector_load()) { return warp_size(); } - return cg_size(); - }(); - - auto const grid_size = detail::grid_size(num_keys, cg_size()); - - auto counter = detail::counter_storage{allocator_, stream}; - counter.reset(stream); - - detail::retrieve - <<>>( - first, num_keys, output_begin, counter.data(), view, key_equal); - - return output_begin + counter.load_to_host(stream); -} - -template -template -std::pair -static_multimap::pair_retrieve( - InputIt first, - InputIt last, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal, - cudaStream_t stream) const -{ - auto const num_pairs = cuco::detail::distance(first, last); - if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); } - - // Using per-warp buffer for vector loads and per-CG buffer for scalar loads - constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u); - constexpr auto block_size = 128; - constexpr auto is_outer = false; - constexpr auto stride = 1; - - auto view = get_device_view(); - auto const flushing_cg_size = [&]() { - if constexpr (uses_vector_load()) { return warp_size(); } - return cg_size(); - }(); - auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - - auto counter = detail::counter_storage{allocator_, stream}; - counter.reset(stream); - - detail::pair_retrieve - <<>>(first, - num_pairs, - probe_output_begin, - contained_output_begin, - counter.data(), - view, - pair_equal); - - auto const h_count = counter.load_to_host(stream); - return {probe_output_begin + h_count, contained_output_begin + h_count}; -} - -template -template -std::pair -static_multimap::pair_retrieve_outer( - InputIt first, - InputIt last, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal, - cudaStream_t stream) const -{ - auto const num_pairs = cuco::detail::distance(first, last); - if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); } - - // Using per-warp buffer for vector loads and per-CG buffer for scalar loads - constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u); - constexpr auto block_size = 128; - constexpr auto is_outer = true; - constexpr auto stride = 1; - - auto view = get_device_view(); - auto const flushing_cg_size = [&]() { - if constexpr (uses_vector_load()) { return warp_size(); } - return cg_size(); - }(); - auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - - auto counter = detail::counter_storage{allocator_, stream}; - counter.reset(stream); - - detail::pair_retrieve - <<>>(first, - num_pairs, - probe_output_begin, - contained_output_begin, - counter.data(), - view, - pair_equal); - - auto const h_count = counter.load_to_host(stream); - return {probe_output_begin + h_count, contained_output_begin + h_count}; -} - -template -template -__device__ __forceinline__ void -static_multimap::device_mutable_view::insert( - cooperative_groups::thread_block_tile g, - value_type const& insert_pair) noexcept -{ - impl_.template insert(g, insert_pair); -} - -template -template -__device__ __forceinline__ static_multimap::device_view -static_multimap::device_view::make_copy( - CG g, pair_atomic_type* const memory_to_use, device_view source_device_view) noexcept -{ -#if defined(CUCO_HAS_CUDA_BARRIER) - __shared__ cuda::barrier barrier; - if (g.thread_rank() == 0) { init(&barrier, g.size()); } - g.sync(); - - cuda::memcpy_async(g, - memory_to_use, - source_device_view.get_slots(), - sizeof(pair_atomic_type) * source_device_view.get_capacity(), - barrier); - - barrier.arrive_and_wait(); -#else - pair_atomic_type const* const slots_ptr = source_device_view.get_slots(); - for (std::size_t i = g.thread_rank(); i < source_device_view.get_capacity(); i += g.size()) { - new (&memory_to_use[i].first) - atomic_key_type{slots_ptr[i].first.load(cuda::memory_order_relaxed)}; - new (&memory_to_use[i].second) - atomic_mapped_type{slots_ptr[i].second.load(cuda::memory_order_relaxed)}; - } - g.sync(); -#endif - - return device_view(memory_to_use, - source_device_view.get_capacity(), - source_device_view.get_empty_key_sentinel(), - source_device_view.get_empty_value_sentinel()); -} - -template -template -__device__ __forceinline__ void -static_multimap::device_view::flush_output_buffer( - CG g, - uint32_t const num_outputs, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin) noexcept -{ - impl_.flush_output_buffer(g, num_outputs, output_buffer, num_matches, output_begin); -} - -template -template -__device__ __forceinline__ void -static_multimap::device_view::flush_output_buffer( - CG g, - uint32_t const num_outputs, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin) noexcept -{ - impl_.flush_output_buffer(g, - num_outputs, - probe_output_buffer, - contained_output_buffer, - num_matches, - probe_output_begin, - contained_output_begin); -} - -template -template -__device__ __forceinline__ bool -static_multimap::device_view::contains( - cooperative_groups::thread_block_tile g, - ProbeKey const& k, - KeyEqual key_equal) const noexcept -{ - constexpr bool is_pair_contains = false; - return impl_.template contains(g, k, key_equal); -} - -template -template -__device__ __forceinline__ bool -static_multimap::device_view::pair_contains( - cooperative_groups::thread_block_tile g, - ProbePair const& p, - PairEqual pair_equal) const noexcept -{ - constexpr bool is_pair_contains = true; - return impl_.template contains(g, p, pair_equal); -} - -template -template -__device__ __forceinline__ std::size_t -static_multimap::device_view::count( - cooperative_groups::thread_block_tile g, - Key const& k, - KeyEqual key_equal) noexcept -{ - constexpr bool is_outer = false; - return impl_.count(g, k, key_equal); -} - -template -template -__device__ __forceinline__ std::size_t -static_multimap::device_view::count_outer( - cooperative_groups::thread_block_tile g, - Key const& k, - KeyEqual key_equal) noexcept -{ - constexpr bool is_outer = true; - return impl_.count(g, k, key_equal); -} - -template -template -__device__ __forceinline__ std::size_t -static_multimap::device_view::pair_count( - cooperative_groups::thread_block_tile g, - value_type const& pair, - PairEqual pair_equal) noexcept -{ - constexpr bool is_outer = false; - return impl_.pair_count(g, pair, pair_equal); -} - -template -template -__device__ __forceinline__ std::size_t -static_multimap::device_view::pair_count_outer( - cooperative_groups::thread_block_tile g, - value_type const& pair, - PairEqual pair_equal) noexcept -{ - constexpr bool is_outer = true; - return impl_.pair_count(g, pair, pair_equal); -} - -template -template -__device__ __forceinline__ void -static_multimap::device_view::retrieve( - FlushingCG flushing_cg, - cooperative_groups::thread_block_tile probing_cg, - Key const& k, - uint32_t* flushing_cg_counter, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin, - KeyEqual key_equal) noexcept -{ - constexpr bool is_outer = false; - if constexpr (uses_vector_load()) { - impl_.template retrieve(flushing_cg, - probing_cg, - k, - flushing_cg_counter, - output_buffer, - num_matches, - output_begin, - key_equal); - } else // In the case of scalar load, flushing CG is the same as probing CG - { - impl_.template retrieve( - probing_cg, k, flushing_cg_counter, output_buffer, num_matches, output_begin, key_equal); - } -} - -template -template -__device__ __forceinline__ void -static_multimap::device_view::retrieve_outer( - FlushingCG flushing_cg, - cooperative_groups::thread_block_tile probing_cg, - Key const& k, - uint32_t* flushing_cg_counter, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin, - KeyEqual key_equal) noexcept -{ - constexpr bool is_outer = true; - if constexpr (uses_vector_load()) { - impl_.template retrieve(flushing_cg, - probing_cg, - k, - flushing_cg_counter, - output_buffer, - num_matches, - output_begin, - key_equal); - } else // In the case of scalar load, flushing CG is the same as probing CG - { - impl_.template retrieve( - probing_cg, k, flushing_cg_counter, output_buffer, num_matches, output_begin, key_equal); - } -} - -template -template -__device__ __forceinline__ void -static_multimap::device_view::pair_retrieve( - cooperative_groups::thread_block_tile probing_cg, - value_type const& pair, - OutputIt1 probe_key_begin, - OutputIt2 probe_val_begin, - OutputIt3 contained_key_begin, - OutputIt4 contained_val_begin, - PairEqual pair_equal) noexcept -{ - constexpr bool is_outer = false; - impl_.pair_retrieve(probing_cg, - pair, - probe_key_begin, - probe_val_begin, - contained_key_begin, - contained_val_begin, - pair_equal); -} - -template -template -__device__ __forceinline__ void -static_multimap::device_view::pair_retrieve( - FlushingCG flushing_cg, - cooperative_groups::thread_block_tile probing_cg, - value_type const& pair, - uint32_t* flushing_cg_counter, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal) noexcept -{ - constexpr bool is_outer = false; - if constexpr (uses_vector_load()) { - impl_.pair_retrieve(flushing_cg, - probing_cg, - pair, - flushing_cg_counter, - probe_output_buffer, - contained_output_buffer, - num_matches, - probe_output_begin, - contained_output_begin, - pair_equal); - } else // In the case of scalar load, flushing CG is the same as probing CG - { - impl_.pair_retrieve(probing_cg, - pair, - flushing_cg_counter, - probe_output_buffer, - contained_output_buffer, - num_matches, - probe_output_begin, - contained_output_begin, - pair_equal); - } -} - -template -template -__device__ __forceinline__ void -static_multimap::device_view::pair_retrieve_outer( - cooperative_groups::thread_block_tile probing_cg, - value_type const& pair, - OutputIt1 probe_key_begin, - OutputIt2 probe_val_begin, - OutputIt3 contained_key_begin, - OutputIt4 contained_val_begin, - PairEqual pair_equal) noexcept -{ - constexpr bool is_outer = true; - impl_.pair_retrieve(probing_cg, - pair, - probe_key_begin, - probe_val_begin, - contained_key_begin, - contained_val_begin, - pair_equal); -} - -template -template -__device__ __forceinline__ void -static_multimap::device_view::pair_retrieve_outer( - FlushingCG flushing_cg, - cooperative_groups::thread_block_tile probing_cg, - value_type const& pair, - uint32_t* flushing_cg_counter, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal) noexcept -{ - constexpr bool is_outer = true; - if constexpr (uses_vector_load()) { - impl_.pair_retrieve(flushing_cg, - probing_cg, - pair, - flushing_cg_counter, - probe_output_buffer, - contained_output_buffer, - num_matches, - probe_output_begin, - contained_output_begin, - pair_equal); - } else // In the case of scalar load, flushing CG is the same as probing CG - { - impl_.pair_retrieve(probing_cg, - pair, - flushing_cg_counter, - probe_output_buffer, - contained_output_buffer, - num_matches, - probe_output_begin, - contained_output_begin, - pair_equal); - } -} - -template -std::size_t static_multimap::get_size( - cudaStream_t stream) const noexcept -{ - auto begin = thrust::make_transform_iterator(raw_slots(), detail::slot_to_tuple{}); - auto filled = cuco::detail::slot_is_filled{get_empty_key_sentinel()}; - - return thrust::count_if(thrust::cuda::par.on(stream), begin, begin + get_capacity(), filled); -} - -template -float static_multimap::get_load_factor( - cudaStream_t stream) const noexcept -{ - auto size = get_size(stream); - return static_cast(size) / static_cast(capacity_); -} } // namespace cuco diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 6bda77367..f11f9bed9 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/include/cuco/probe_sequences.cuh b/include/cuco/probe_sequences.cuh deleted file mode 100644 index 89840051f..000000000 --- a/include/cuco/probe_sequences.cuh +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cuco::legacy { - -/** - * @brief Public linear probing scheme class. - * - * Linear probing is efficient when few collisions are present. Performance hints: - * - Use linear probing when collisions are rare. e.g. low occupancy or low multiplicity. - * - `CGSize` = 1 or 2 when hash map is small (10'000'000 or less), 4 or 8 otherwise. - * - * `Hash` should be callable object type. - * - * @tparam CGSize Size of CUDA Cooperative Groups - * @tparam Hash Unary callable type - */ -template -class linear_probing : public detail::probe_sequence_base { - public: - using probe_sequence_base_type = - detail::probe_sequence_base; ///< The base probe scheme type - using probe_sequence_base_type::cg_size; - using probe_sequence_base_type::vector_width; - - /// Type of implementation details - template - using impl = detail::linear_probing_impl; -}; - -/** - * - * @brief Public double hashing scheme class. - * - * Default probe sequence for `cuco::static_multimap`. Double hashing shows superior - * performance when dealing with high multiplicty and/or high occupancy use cases. Performance - * hints: - * - `CGSize` = 1 or 2 when hash map is small (10'000'000 or less), 4 or 8 otherwise. - * - * `Hash1` and `Hash2` should be callable object type. - * - * @tparam CGSize Size of CUDA Cooperative Groups - * @tparam Hash1 Unary callable type - * @tparam Hash2 Unary callable type - */ -template -class double_hashing : public detail::probe_sequence_base { - public: - using probe_sequence_base_type = - detail::probe_sequence_base; ///< The base probe scheme type - using probe_sequence_base_type::cg_size; - using probe_sequence_base_type::vector_width; - - /// Type of implementation details - template - using impl = detail::double_hashing_impl; -}; - -} // namespace cuco::legacy diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 0fdefdf1c..449e6428a 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,9 +18,7 @@ #include #include -#include #include -#include #include #include #include @@ -34,15 +32,10 @@ #include #endif -#include - #include #include -#include -#include namespace cuco { -namespace experimental { /** * @brief A GPU-accelerated, unordered, associative container of key-value pairs that supports * equivalent keys. @@ -944,1337 +937,7 @@ class static_multimap { std::unique_ptr impl_; ///< Static map implementation mapped_type empty_value_sentinel_; ///< Sentinel value that indicates an empty payload }; -} // namespace experimental - -/** - * @brief A GPU-accelerated, unordered, associative container of key-value pairs that supports - * equivalent keys. - * - * Allows constant time concurrent inserts or concurrent find operations from threads in device - * code. Concurrent insert/find is allowed only when - * static_multimap::supports_concurrent_insert_find() is true. - * - * Current limitations: - * - Requires keys and values where `cuco::is_bitwise_comparable_v` is true - * - Comparisons against the "sentinel" values will always be done with bitwise comparisons - * Therefore, the objects must have unique, bitwise object representations (e.g., no padding bits). - * - Does not support erasing keys - * - Capacity is fixed and will not grow automatically - * - Requires the user to specify sentinel values for both key and mapped value - * to indicate empty slots - * - Concurrent insert/find is only supported when - * static_multimap::supports_concurrent_insert_find() is true` - * - * The `static_multimap` supports two types of operations: - * - Host-side "bulk" operations - * - Device-side "singular" operations - * - * The host-side bulk operations include `insert`, `contains`, `count`, `retrieve` and their - * variants. These APIs should be used when there are a large number of keys to insert or lookup in - * the map. For example, given a range of keys specified by device-accessible iterators, the bulk - * `insert` function will insert all keys into the map. - * - * The singular device-side operations allow individual threads to perform - * independent operations (e.g. `insert`, etc.) from device code. These - * operations are accessed through non-owning, trivially copyable "view" types: - * `device_view` and `device_mutable_view`. The `device_view` class is an - * immutable view that allows only non-modifying operations such as `count` or - * `contains`. The `device_mutable_view` class only allows `insert` operations. - * The two types are separate to prevent erroneous concurrent insert/find - * operations. - * - * By default, when querying for a Key `k` in operations like `count` or `retrieve`, if `k` is not - * present in the map, it will not contribute to the output. Query APIs with the `_outer` suffix - * will include non-matching keys in the output. See the relevant API documentation for more - * information. - * - * Typical associative container query APIs like `retrieve` look up values by solely by key, e.g., - * `count` for a Key `k` will count all values whose associated key `k'` matches `k` as determined - * by `key_equal(k, k')`. In some cases, one may want to consider both key _and_ value when - * determining if a key-value pair should contribute to the output. `static_multimap` supports this - * use case with APIs prefixed with `pair_`, e.g., `pair_count` is given a key-value pair - * `{k,v}` and only counts key-value pairs, `{k', v'}`, in the map where `pair_equal({k,v}, {k', - * v'})` is true. See the relevant API documentation for more information. - * - * Example: - * \code{.cpp} - * int empty_key_sentinel = -1; - * int empty_value_sentinel = -1; - * - * // Constructs a multimap with 100,000 slots using -1 and -1 as the empty key/value - * // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, - * // for an load factor of 50%. - * static_multimap m{100'000, empty_key_sentinel, empty_value_sentinel}; - * - * // Create a sequence of pairs {{0,0}, {1,1}, ... {i,i}} - * thrust::device_vector> pairs(50,000); - * thrust::transform(thrust::make_counting_iterator(0), - * thrust::make_counting_iterator(pairs.size()), - * pairs.begin(), - * []__device__(auto i){ return cuco::pair{i,i}; }; - * - * // Inserts all pairs into the map - * m.insert(pairs.begin(), pairs.end()); - * - * // Get a `device_view` and passes it to a kernel where threads may perform - * // `contains/count/retrieve` lookups - * kernel<<<...>>>(m.get_device_view()); - * \endcode - * - * - * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` - * @tparam Value Type of the mapped values. Requires `cuco::is_bitwise_comparable_v` - * @tparam Scope The scope in which multimap operations will be performed by - * individual threads - * @tparam ProbeSequence Probe sequence chosen between `cuco::legacy::linear_probing` - * and `cuco::legacy::double_hashing`. (see `probe_sequences.cuh`) - * @tparam Allocator Type of allocator used for device storage - */ -template , - class ProbeSequence = cuco::legacy::double_hashing<8, cuco::default_hash_function>> -class static_multimap { - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - - static_assert( - cuco::is_bitwise_comparable_v, - "Value type must have unique object representations or have been explicitly declared as safe " - "for bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - - static_assert(std::is_base_of_v, - ProbeSequence>, - "ProbeSequence must be a specialization of either cuco::legacy::double_hashing or " - "cuco::legacy::linear_probing."); - - public: - using value_type = cuco::pair; ///< Type of key/value pairs - using key_type = Key; ///< Key type - using mapped_type = Value; ///< Type of mapped values - using size_type = std::size_t; ///< Size type - using atomic_key_type = cuda::atomic; ///< Type of atomic keys - using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values - using pair_atomic_type = - cuco::pair; ///< Pair type of atomic key and atomic mapped value - using allocator_type = typename std::allocator_traits::template rebind_alloc< - pair_atomic_type>; ///< Type of the allocator to (de)allocate slots - using probe_sequence_type = - cuco::legacy::detail::probe_sequence; ///< Probe scheme type - - static_multimap(static_multimap const&) = delete; - static_multimap& operator=(static_multimap const&) = delete; - - static_multimap(static_multimap&&) = default; ///< Move constructor - - /** - * @brief Replaces the contents of the map with another map. - * - * @return Reference of the current map object - */ - static_multimap& operator=(static_multimap&&) = default; - ~static_multimap() = default; - - /** - * @brief Indicate if concurrent insert/find is supported for the key/value types. - * - * @return Boolean indicating if concurrent insert/find is supported. - */ - __host__ __device__ __forceinline__ static constexpr bool - supports_concurrent_insert_find() noexcept - { - return cuco::detail::is_packable(); - } - - /** - * @brief The size of the CUDA cooperative thread group. - * - * @return The CG size. - */ - __host__ __device__ __forceinline__ static constexpr uint32_t cg_size() noexcept - { - return ProbeSequence::cg_size; - } - - /** - * @brief Construct a statically-sized map with the specified initial capacity, - * sentinel values and CUDA stream. - * - * The capacity of the map is fixed. Insert operations will not automatically - * grow the map. Attempting to insert more unique keys than the capacity of - * the map results in undefined behavior. - * - * Performance begins to degrade significantly beyond a load factor of ~70%. - * For best performance, choose a capacity that will keep the load factor - * below 70%. E.g., if inserting `N` unique keys, choose a capacity of - * `N * (1/0.7)`. - * - * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and - * undefined behavior results from attempting to insert any key/value pair - * that contains either. - * - * @param capacity The total number of slots in the map - * @param empty_key_sentinel The reserved key value for empty slots - * @param empty_value_sentinel The reserved mapped value for empty slots - * @param stream CUDA stream used to initialize the map - * @param alloc Allocator used for allocating device storage - */ - static_multimap(std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel, - cudaStream_t stream = 0, - Allocator const& alloc = Allocator{}); - - /** - * @brief Inserts all key/value pairs in the range `[first, last)`. - * - * @tparam InputIt Device accessible random access input iterator where - * std::is_convertible::value_type, - * static_multimap::value_type> is `true` - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param stream CUDA stream used for insert - */ - template - void insert(InputIt first, InputIt last, cudaStream_t stream = 0); - - /** - * @brief Inserts key/value pairs in the range `[first, first + n)` if `pred` - * of the corresponding stencil returns true. - * - * The key/value pair `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. - * - * @tparam InputIt Device accessible random access input iterator where - * std::is_convertible::value_type, - * static_multimap::value_type> is `true` - * @tparam StencilIt Device accessible random access iterator whose value_type is - * convertible to Predicate's argument type - * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and - * argument type is convertible from std::iterator_traits::value_type. - * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs - * @param stencil Beginning of the stencil sequence - * @param pred Predicate to test on every element in the range `[stencil, stencil + - * std::distance(first, last))` - * @param stream CUDA stream used for insert - */ - template - void insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream = 0); - - /** - * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. - * - * Stores `true` or `false` to `(output + i)` indicating if the key `*(first + i)` exists in the - * map. - * - * ProbeSequence hashers should be callable with both - * std::iterator_traits::value_type and Key type. - * std::invoke_result::value_type, Key> must be - * well-formed. - * - * @tparam InputIt Device accessible input iterator - * @tparam OutputIt Device accessible output iterator assignable from `bool` - * @tparam KeyEqual Binary callable type used to compare two keys for equality - * - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the output sequence indicating whether each key is present - * @param key_equal The binary function to compare two keys for equality - * @param stream CUDA stream used for contains - */ - template > - void contains(InputIt first, - InputIt last, - OutputIt output_begin, - KeyEqual key_equal = KeyEqual{}, - cudaStream_t stream = 0) const; - - /** - * @brief Indicates whether the pairs in the range `[first, last)` are contained in the map. - * - * Stores `true` or `false` to `(output + i)` indicating if the pair `*(first + i)` exists in - * the map. - * - * ProbeSequence hashers should be callable with both - * std::iterator_traits::value_type::first_type - * and Key type. std::invoke_result::value_type::first_type, Key> - * must be well-formed. - * - * @tparam InputIt Device accessible random access input iterator - * @tparam OutputIt Device accessible output iterator assignable from `bool` - * @tparam PairEqual Binary callable type used to compare input pair and slot content for equality - * - * @param first Beginning of the sequence of pairs - * @param last End of the sequence of pairs - * @param output_begin Beginning of the output sequence indicating whether each pair is present - * @param pair_equal The binary function to compare input pair and slot content for equality - * @param stream CUDA stream used for contains - */ - template - void pair_contains(InputIt first, - InputIt last, - OutputIt output_begin, - PairEqual pair_equal, - cudaStream_t stream = 0) const; - - /** - * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap. - * - * For each key, `k = *(first + i)`, counts all matching keys, `k'`, as determined by - * `key_equal(k, k')` and returns the sum of all matches for all keys. - * - * @tparam Input Device accessible input iterator whose `value_type` is convertible to `key_type` - * @tparam KeyEqual Binary callable - * @param first Beginning of the sequence of keys to count - * @param last End of the sequence of keys to count - * @param stream CUDA stream used for count - * @param key_equal Binary function to compare two keys for equality - * @return The sum of total occurrences of all keys in `[first, last)` - */ - template > - std::size_t count(InputIt first, - InputIt last, - cudaStream_t stream = 0, - KeyEqual key_equal = KeyEqual{}) const; - - /** - * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap. - * - * For each key, `k = *(first + i)`, counts all matching keys, `k'`, as determined by - * `key_equal(k, k')` and returns the sum of all matches for all keys. If `k` does not have any - * matches, it contributes 1 to the final sum. - * - * @tparam Input Device accessible input iterator whose `value_type` is convertible to `key_type` - * @tparam KeyEqual Binary callable - * @param first Beginning of the sequence of keys to count - * @param last End of the sequence of keys to count - * @param stream CUDA stream used for count_outer - * @param key_equal Binary function to compare two keys for equality - * @return The sum of total occurrences of all keys in `[first, last)` where keys without matches - * are considered to have a single occurrence. - */ - template > - std::size_t count_outer(InputIt first, - InputIt last, - cudaStream_t stream = 0, - KeyEqual key_equal = KeyEqual{}) const; - - /** - * @brief Counts the occurrences of key/value pairs in `[first, last)` contained in the multimap. - * - * For key-value pair, `kv = *(first + i)`, counts all matching key-value pairs, `kv'`, as - * determined by `pair_equal(kv, kv')` and returns the sum of all matches for all key-value pairs. - * - * @tparam InputIt Device accessible random access input iterator where - * std::is_convertible::value_type, - * static_multimap::value_type> is `true` - * @tparam PairEqual Binary callable - * @param first Beginning of the sequence of pairs to count - * @param last End of the sequence of pairs to count - * @param pair_equal Binary function to compare two pairs for equality - * @param stream CUDA stream used for pair_count - * @return The sum of total occurrences of all pairs in `[first, last)` - */ - template - std::size_t pair_count(InputIt first, - InputIt last, - PairEqual pair_equal, - cudaStream_t stream = 0) const; - - /** - * @brief Counts the occurrences of key/value pairs in `[first, last)` contained in the multimap. - * - * For key-value pair, `kv = *(first + i)`, counts all matching key-value pairs, `kv'`, as - * determined by `pair_equal(kv, kv')` and returns the sum of all matches for all key-value pairs. - * if `kv` does not have any matches, it contributes 1 to the final sum. - * - * @tparam InputIt Device accessible random access input iterator where - * std::is_convertible::value_type, - * static_multimap::value_type> is `true` - * @tparam PairEqual Binary callable - * @param first Beginning of the sequence of pairs to count - * @param last End of the sequence of pairs to count - * @param pair_equal Binary function to compare two pairs for equality - * @param stream CUDA stream used for pair_count_outer - * @return The sum of total occurrences of all pairs in `[first, last)` where a key-value pair - * without a match is considered to have a single occurrence - */ - template - std::size_t pair_count_outer(InputIt first, - InputIt last, - PairEqual pair_equal, - cudaStream_t stream = 0) const; - - /** - * @brief Retrieves all the values corresponding to all keys in the range `[first, last)`. - * - * If key `k = *(first + i)` exists in the map, copies `k` and all associated values to - * unspecified locations in `[output_begin, output_end)`. Else, does nothing. - * - * Behavior is undefined if the size of the output range exceeds `std::distance(output_begin, - * output_end)`. Use `count()` to determine the size of the output range. - * - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of key/value pairs retrieved for each key - * @param stream CUDA stream used for retrieve - * @param key_equal The binary function to compare two keys for equality - * @return The iterator indicating the last valid key/value pairs in the output - */ - template > - OutputIt retrieve(InputIt first, - InputIt last, - OutputIt output_begin, - cudaStream_t stream = 0, - KeyEqual key_equal = KeyEqual{}) const; - - /** - * @brief Retrieves all the matches corresponding to all keys in the range `[first, last)`. - * - * If key `k = *(first + i)` exists in the map, copies `k` and all associated values to - * unspecified locations in `[output_begin, output_end)`. Else, copies `k` and - * `empty_value_sentinel`. - * - * Behavior is undefined if the size of the output range exceeds `std::distance(output_begin, - * output_end)`. Use `count_outer()` to determine the size of the output range. - * - * @tparam InputIt Device accessible input iterator whose `value_type` is - * convertible to the map's `key_type` - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @tparam KeyEqual Binary callable type - * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys - * @param output_begin Beginning of the sequence of key/value pairs retrieved for each key - * @param stream CUDA stream used for retrieve_outer - * @param key_equal The binary function to compare two keys for equality - * @return The iterator indicating the last valid key/value pairs in the output - */ - template > - OutputIt retrieve_outer(InputIt first, - InputIt last, - OutputIt output_begin, - cudaStream_t stream = 0, - KeyEqual key_equal = KeyEqual{}) const; - - /** - * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`. - * - * The `pair_` prefix indicates that the input data type is convertible to the map's - * `value_type`. If pair_equal(*(first + i), slot[j]) returns true, then *(first+i) is - * stored to `probe_output_begin`, and slot[j] is stored to `contained_output_begin`. - * - * Behavior is undefined if the size of the output range exceeds - * `std::distance(probe_output_begin, probe_output_end)` (or - * `std::distance(contained_output_begin, contained_output_end)`). Use - * `pair_count()` to determine the size of the output range. - * - * @tparam InputIt Device accessible random access input iterator where - * std::is_convertible::value_type, - * static_multimap::value_type> is `true` - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @tparam PairEqual Binary callable type - * @param first Beginning of the sequence of pairs - * @param last End of the sequence of pairs - * @param probe_output_begin Beginning of the sequence of the matched probe pairs - * @param contained_output_begin Beginning of the sequence of the matched contained pairs - * @param pair_equal The binary function to compare two pairs for equality - * @param stream CUDA stream used for pair_retrieve - * @return Pair of iterators pointing to the last elements in the output - */ - template - std::pair pair_retrieve(InputIt first, - InputIt last, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal, - cudaStream_t stream = 0) const; - - /** - * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`. - * - * The `pair_` prefix indicates that the input data type is convertible to the map's `value_type`. - * If pair_equal(*(first + i), slot[j]) returns true, then *(first+i) is stored to - * `probe_output_begin`, and slot[j] is stored to `contained_output_begin`. If *(first+i) doesn't - * have matches in the map, copies *(first + i) in `probe_output_begin` and a pair of - * `empty_key_sentinel` and `empty_value_sentinel` in `contained_output_begin`. - * - * Behavior is undefined if the size of the output range exceeds - * `std::distance(probe_output_begin, probe_output_end)` (or - * `std::distance(contained_output_begin, contained_output_end)`). Use - * `pair_count()` to determine the size of the output range. - * - * @tparam InputIt Device accessible random access input iterator where - * std::is_convertible::value_type, - * static_multimap::value_type> is `true` - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @tparam PairEqual Binary callable type - * @param first Beginning of the sequence of pairs - * @param last End of the sequence of pairs - * @param probe_output_begin Beginning of the sequence of the matched probe pairs - * @param contained_output_begin Beginning of the sequence of the matched contained pairs - * @param pair_equal The binary function to compare two pairs for equality - * @param stream CUDA stream used for pair_retrieve_outer - * @return Pair of iterators pointing to the last elements in the output - */ - template - std::pair pair_retrieve_outer(InputIt first, - InputIt last, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal, - cudaStream_t stream = 0) const; - - private: - /** - * @brief Indicates if vector-load is used. - * - * Users have no explicit control on whether vector-load is used. - * - * @return Boolean indicating if vector-load is used. - */ - static __host__ __device__ constexpr bool uses_vector_load() noexcept - { - return cuco::detail::is_packable(); - } - - /** - * @brief Returns the number of pairs loaded with each vector-load - */ - static __host__ __device__ constexpr uint32_t vector_width() noexcept - { - return ProbeSequence::vector_width(); - } - - /** - * @brief Returns the warp size. - */ - static __host__ __device__ constexpr uint32_t warp_size() noexcept { return 32u; } - - /** - * @brief Custom deleter for unique pointer of slots. - */ - struct slot_deleter { - slot_deleter(allocator_type& a, size_t& c, cuda::stream_ref s) - : allocator{a}, capacity{c}, stream{s} - { - } - - slot_deleter(slot_deleter const&) = default; - - void operator()(pair_atomic_type* ptr) { allocator.deallocate(ptr, capacity, stream); } - - allocator_type& allocator; - size_t& capacity; - cuda::stream_ref stream; - }; - - class device_view_impl_base; - class device_mutable_view_impl; - class device_view_impl; - - template - class device_view_base { - protected: - // Import member type definitions from `static_multimap` - using value_type = value_type; - using key_type = Key; - using mapped_type = Value; - using pair_atomic_type = pair_atomic_type; - using iterator = pair_atomic_type*; - using const_iterator = pair_atomic_type const*; - using probe_sequence_type = probe_sequence_type; - - __host__ __device__ device_view_base(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel) noexcept - : impl_{slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value} - { - } - - public: - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ __forceinline__ pair_atomic_type* get_slots() noexcept { return impl_.get_slots(); } - - /** - * @brief Gets slots array. - * - * @return Slots array - */ - __device__ __forceinline__ pair_atomic_type const* get_slots() const noexcept - { - return impl_.get_slots(); - } - - /** - * @brief Gets the maximum number of elements the hash map can hold. - * - * @return The maximum number of elements the hash map can hold - */ - __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept - { - return impl_.get_capacity(); - } - - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept - { - return impl_.get_empty_key_sentinel(); - } - - /** - * @brief Gets the sentinel value used to represent an empty value slot. - * - * @return The sentinel value used to represent an empty value slot - */ - __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept - { - return impl_.get_empty_value_sentinel(); - } - - protected: - ViewImpl impl_; - }; // class device_view_base - - public: - /** - * @brief Mutable, non-owning view-type that may be used in device code to - * perform singular inserts into the map. - * - * `device_mutable_view` is trivially-copyable and is intended to be passed by - * value. - * - * Example: - * \code{.cpp} - * cuco::static_multimap m{100'000, -1, -1}; - * - * // Inserts a sequence of pairs {{0,0}, {1,1}, ... {i,i}} - * thrust::for_each(thrust::make_counting_iterator(0), - * thrust::make_counting_iterator(50'000), - * [map = m.get_device_mutable_view()] - * __device__ (auto i) mutable { - * map.insert(cuco::pair{i,i}); - * }); - * \endcode - */ - class device_mutable_view : public device_view_base { - public: - using view_base_type = - device_view_base; ///< Base view implementation type - using value_type = typename view_base_type::value_type; ///< Type of key/value pairs - using key_type = typename view_base_type::key_type; ///< Key type - using mapped_type = typename view_base_type::mapped_type; ///< Type of the mapped values - using iterator = - typename view_base_type::iterator; ///< Type of the forward iterator to `value_type` - using const_iterator = - typename view_base_type::const_iterator; ///< Type of the forward iterator to `const - ///< value_type` - - /** - * @brief Construct a mutable view of the first `capacity` slots of the - * slots array pointed to by `slots`. - * - * @param slots Pointer to beginning of initialized slots array - * @param capacity The number of slots viewed by this object - * @param empty_key_sentinel The reserved value for keys to represent empty - * slots - * @param empty_value_sentinel The reserved value for mapped values to - * represent empty slots - */ - __host__ __device__ device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel) noexcept - : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} - { - } - - /** - * @brief Inserts the specified key/value pair into the map. - * - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group that performs the insert - * @param insert_pair The pair to insert - */ - template - __device__ __forceinline__ void insert( - cooperative_groups::thread_block_tile g, - value_type const& insert_pair) noexcept; - - private: - using device_view_base::impl_; - }; // class device mutable view - - /** - * @brief Non-owning view-type that may be used in device code to - * perform singular find and contains operations for the map. - * - * `device_view` is trivially-copyable and is intended to be passed by - * value. - * - */ - class device_view : public device_view_base { - public: - using view_base_type = device_view_base; ///< Base view implementation type - using value_type = typename view_base_type::value_type; ///< Type of key/value pairs - using key_type = typename view_base_type::key_type; ///< Key type - using mapped_type = typename view_base_type::mapped_type; ///< Type of the mapped values - using iterator = - typename view_base_type::iterator; ///< Type of the forward iterator to `value_type` - using const_iterator = - typename view_base_type::const_iterator; ///< Type of the forward iterator to `const - ///< value_type` - - /** - * @brief Construct a view of the first `capacity` slots of the - * slots array pointed to by `slots`. - * - * @param slots Pointer to beginning of initialized slots array - * @param capacity The number of slots viewed by this object - * @param empty_key_sentinel The reserved value for keys to represent empty - * slots - * @param empty_value_sentinel The reserved value for mapped values to - * represent empty slots - */ - __host__ __device__ device_view(pair_atomic_type* slots, - std::size_t capacity, - empty_key empty_key_sentinel, - empty_value empty_value_sentinel) noexcept - : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} - { - } - - /** - * @brief Makes a copy of given `device_view` using non-owned memory. - * - * This function is intended to be used to create shared memory copies of small static maps, - * although global memory can be used as well. - * - * @tparam CG The type of the cooperative thread group - * @param g The cooperative thread group used to copy the slots - * @param source_device_view `device_view` to copy from - * @param memory_to_use Array large enough to support `capacity` elements. Object does not - * take the ownership of the memory - * @return Copy of passed `device_view` - */ - template - __device__ __forceinline__ static device_view make_copy( - CG g, pair_atomic_type* const memory_to_use, device_view source_device_view) noexcept; - - /** - * @brief Flushes per-CG buffer into the output sequence. - * - * A given CUDA Cooperative Group, `g`, loads `num_outputs` key-value pairs from `output_buffer` - * and writes them into global memory in a coalesced fashion. CG-wide `memcpy_sync` is used if - * `thrust::is_contiguous_iterator_v` returns true. All threads of `g` must be active - * due to implicit CG-wide synchronization during flushing. - * - * @tparam CG Cooperative Group type - * @tparam atomicT Type of atomic storage - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @param g The Cooperative Group used to flush output buffer - * @param num_outputs Number of valid output in the buffer - * @param output_buffer Buffer of the key/value pair sequence - * @param num_matches Size of the output sequence - * @param output_begin Beginning of the output sequence of key/value pairs - */ - template - __device__ __forceinline__ void flush_output_buffer(CG g, - uint32_t const num_outputs, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin) noexcept; - - /** - * @brief Flushes per-CG buffer into the output sequences. - * - * A given CUDA Cooperative Group, `g`, loads `num_outputs` elements from `probe_output_buffer` - * and `num_outputs` elements from `contained_output_buffer`, then writes them into global - * memory started from `probe_output_begin` and `contained_output_begin` respectively. All - * threads of `g` must be active due to implicit CG-wide synchronization during flushing. - * - * @tparam CG Cooperative Group type - * @tparam atomicT Type of atomic storage - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @param g The Cooperative Group used to flush output buffer - * @param num_outputs Number of valid output in the buffer - * @param probe_output_buffer Buffer of the matched probe pair sequence - * @param contained_output_buffer Buffer of the matched contained pair sequence - * @param num_matches Size of the output sequence - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs - */ - template - __device__ __forceinline__ void flush_output_buffer(CG g, - uint32_t const num_outputs, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin) noexcept; - - /** - * @brief Indicates whether the key `k` exists in the map. - * - * If the key `k` was inserted into the map, `contains` returns - * true. Otherwise, it returns false. Uses the CUDA Cooperative Groups API to - * to leverage multiple threads to perform a single `contains` operation. This provides a - * significant boost in throughput compared to the non Cooperative Group - * `contains` at moderate to high load factors. - * - * ProbeSequence hashers should be callable with both ProbeKey and Key type. - * `std::invoke_result` must be well-formed. - * - * If `key_equal(probe_key, slot_key)` returns true, `hash(probe_key) == hash(slot_key)` must - * also be true. - * - * @tparam ProbeKey Probe key type - * @tparam KeyEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group used to perform the contains operation - * @param k The key to search for - * @param key_equal The binary callable used to compare two keys - * for equality - * @return A boolean indicating whether the key/value pair - * containing `k` was inserted - */ - template , - typename ParentCG = void> - __device__ __forceinline__ bool contains( - cooperative_groups::thread_block_tile g, - ProbeKey const& k, - KeyEqual key_equal = KeyEqual{}) const noexcept; - - /** - * @brief Indicates whether the pair `p` exists in the map. - * - * If the pair `p` was inserted into the map, `contains` returns - * true. Otherwise, it returns false. Uses the CUDA Cooperative Groups API to - * to leverage multiple threads to perform a single `contains` operation. This provides a - * significant boost in throughput compared to the non Cooperative Group - * `contains` at moderate to high load factors. - * - * ProbeSequence hashers should be callable with both ProbePair::first_type and Key type. - * `std::invoke_result` must be well-formed. - * - * If `pair_equal(p, slot_content)` returns true, `hash(p.first) == hash(slot_key)` must - * also be true. - * - * @tparam ProbePair Probe pair type - * @tparam PairEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group used to perform the contains operation - * @param p The pair to search for - * @param pair_equal The binary callable used to compare input pair and slot content - * for equality - * @return A boolean indicating whether the input pair was inserted in the map - */ - template - __device__ __forceinline__ bool pair_contains( - cooperative_groups::thread_block_tile g, - ProbePair const& p, - PairEqual pair_equal) const noexcept; - - /** - * @brief Counts the occurrence of a given key contained in multimap. - * - * For a given key, `k`, counts all matching keys, `k'`, as determined by `key_equal(k, k')` and - * returns the sum of all matches for `k`. - * - * @tparam KeyEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group used to perform the count operation - * @param k The key to search for - * @param key_equal The binary callable used to compare two keys - * for equality - * @return Number of matches found by the current thread - */ - template , typename ParentCG = void> - __device__ __forceinline__ std::size_t count( - cooperative_groups::thread_block_tile g, - Key const& k, - KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Counts the occurrence of a given key contained in multimap. If no - * matches can be found for a given key, the corresponding occurrence is 1. - * - * For a given key, `k`, counts all matching keys, `k'`, as determined by `key_equal(k, k')` and - * returns the sum of all matches for `k`. If `k` does not have any matches, returns 1. - * - * @tparam KeyEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group used to perform the count operation - * @param k The key to search for - * @param key_equal The binary callable used to compare two keys - * for equality - * @return Number of matches found by the current thread - */ - template , typename ParentCG = void> - __device__ __forceinline__ std::size_t count_outer( - cooperative_groups::thread_block_tile g, - Key const& k, - KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Counts the occurrence of a given key/value pair contained in multimap. - * - * For a given pair, `p`, counts all matching pairs, `p'`, as determined by `pair_equal(p, p')` - * and returns the sum of all matches for `p`. - * - * @tparam PairEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group used to perform the pair_count operation - * @param pair The pair to search for - * @param pair_equal The binary callable used to compare two pairs - * for equality - * @return Number of matches found by the current thread - */ - template - __device__ __forceinline__ std::size_t pair_count( - cooperative_groups::thread_block_tile g, - value_type const& pair, - PairEqual pair_equal) noexcept; - - /** - * @brief Counts the occurrence of a given key/value pair contained in multimap. - * If no matches can be found for a given key, the corresponding occurrence is 1. - * - * For a given pair, `p`, counts all matching pairs, `p'`, as determined by `pair_equal(p, p')` - * and returns the sum of all matches for `p`. If `p` does not have any matches, returns 1. - * - * @tparam PairEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param g The Cooperative Group used to perform the pair_count operation - * @param pair The pair to search for - * @param pair_equal The binary callable used to compare two pairs - * for equality - * @return Number of matches found by the current thread - */ - template - __device__ __forceinline__ std::size_t pair_count_outer( - cooperative_groups::thread_block_tile g, - value_type const& pair, - PairEqual pair_equal) noexcept; - - /** - * @brief Retrieves all the matches of a given key contained in multimap with per-flushing-CG - * shared memory buffer. - * - * For key `k` existing in the map, copies `k` and all associated values to unspecified - * locations in `[output_begin, output_end)`. - * - * @tparam buffer_size Size of the output buffer - * @tparam FlushingCG Type of Cooperative Group used to flush output buffer - * @tparam atomicT Type of atomic storage - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @tparam KeyEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param flushing_cg The Cooperative Group used to flush output buffer - * @param probing_cg The Cooperative Group used to retrieve - * @param k The key to search for - * @param flushing_cg_counter Pointer to flushing_cg counter - * @param output_buffer Shared memory buffer of the key/value pair sequence - * @param num_matches Size of the output sequence - * @param output_begin Beginning of the output sequence of key/value pairs - * @param key_equal The binary callable used to compare two keys - * for equality - */ - template , - typename ParentCG = void> - __device__ __forceinline__ void retrieve( - FlushingCG flushing_cg, - cooperative_groups::thread_block_tile probing_cg, - Key const& k, - uint32_t* flushing_cg_counter, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin, - KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Retrieves all the matches of a given key contained in multimap with per-flushing-CG - * shared memory buffer. - * - * For key `k` existing in the map, copies `k` and all associated values to unspecified - * locations in `[output_begin, output_end)`. If `k` does not have any matches, copies `k` and - * `empty_value_sentinel()` into the output. - * - * @tparam buffer_size Size of the output buffer - * @tparam FlushingCG Type of Cooperative Group used to flush output buffer - * @tparam atomicT Type of atomic storage - * @tparam OutputIt Device accessible output iterator whose `value_type` is - * constructible from the map's `value_type` - * @tparam KeyEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param flushing_cg The Cooperative Group used to flush output buffer - * @param probing_cg The Cooperative Group used to retrieve - * @param k The key to search for - * @param flushing_cg_counter Pointer to flushing_cg counter - * @param output_buffer Shared memory buffer of the key/value pair sequence - * @param num_matches Size of the output sequence - * @param output_begin Beginning of the output sequence of key/value pairs - * @param key_equal The binary callable used to compare two keys - * for equality - */ - template , - typename ParentCG = void> - __device__ __forceinline__ void retrieve_outer( - FlushingCG flushing_cg, - cooperative_groups::thread_block_tile probing_cg, - Key const& k, - uint32_t* flushing_cg_counter, - value_type* output_buffer, - atomicT* num_matches, - OutputIt output_begin, - KeyEqual key_equal = KeyEqual{}) noexcept; - - /** - * @brief Retrieves all the matches of a given pair - * - * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p, - * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, - * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an - * unspecified value of `j` where `0 <= j < n`. - * - * Concurrent reads or writes to any of the output ranges results in undefined behavior. - * - * Behavior is undefined if the extent of any of the output ranges is less than `n`. - * - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `pair`'s `Key` type. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * `pair`'s `Value` type. - * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from - * the map's `key_type`. - * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from - * the map's `mapped_type`. - * @tparam PairEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param probing_cg The Cooperative Group used to retrieve - * @param pair The pair to search for - * @param probe_key_begin Beginning of the output sequence of the matched probe keys - * @param probe_val_begin Beginning of the output sequence of the matched probe values - * @param contained_key_begin Beginning of the output sequence of the matched contained keys - * @param contained_val_begin Beginning of the output sequence of the matched contained values - * @param pair_equal The binary callable used to compare two pairs for equality - */ - template - __device__ __forceinline__ void pair_retrieve( - cooperative_groups::thread_block_tile probing_cg, - value_type const& pair, - OutputIt1 probe_key_begin, - OutputIt2 probe_val_begin, - OutputIt3 contained_key_begin, - OutputIt4 contained_val_begin, - PairEqual pair_equal) noexcept; - - /** - * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG - * shared memory buffer. - * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations - * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_end)`. - * - * @tparam buffer_size Size of the output buffer - * @tparam FlushingCG Type of Cooperative Group used to flush output buffer - * @tparam atomicT Type of atomic storage - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @tparam PairEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param flushing_cg The Cooperative Group used to flush output buffer - * @param probing_cg The Cooperative Group used to retrieve - * @param pair The pair to search for - * @param warp_counter Pointer to the warp counter - * @param probe_output_buffer Buffer of the matched probe pair sequence - * @param contained_output_buffer Buffer of the matched contained pair sequence - * @param num_matches Size of the output sequence - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs - * @param pair_equal The binary callable used to compare two pairs for equality - */ - template - __device__ __forceinline__ void pair_retrieve( - FlushingCG flushing_cg, - cooperative_groups::thread_block_tile probing_cg, - value_type const& pair, - uint32_t* warp_counter, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal) noexcept; - - /** - * @brief Retrieves all the matches of a given pair - * - * For pair `p` with `n = pair_count_outer(cg, p, pair_equal)` matching pairs, if `pair_equal(p, - * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, - * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an - * unspecified value of `j` where `0 <= j < n`. If `p` does not have any matches, stores - * `probe_key_begin[0] = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] = - * empty_key_sentinel`, and `contained_val_begin[0] = empty_value_sentinel`. - * - * Concurrent reads or writes to any of the output ranges results in undefined behavior. - * - * Behavior is undefined if the extent of any of the output ranges is less than `n`. - * - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `pair`'s `Key` type. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * `pair`'s `Value` type. - * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from - * the map's `key_type`. - * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from - * the map's `mapped_type`. - * @tparam PairEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param probing_cg The Cooperative Group used to retrieve - * @param pair The pair to search for - * @param probe_key_begin Beginning of the output sequence of the matched probe keys - * @param probe_val_begin Beginning of the output sequence of the matched probe values - * @param contained_key_begin Beginning of the output sequence of the matched contained keys - * @param contained_val_begin Beginning of the output sequence of the matched contained values - * @param pair_equal The binary callable used to compare two pairs for equality - */ - template - __device__ __forceinline__ void pair_retrieve_outer( - cooperative_groups::thread_block_tile probing_cg, - value_type const& pair, - OutputIt1 probe_key_begin, - OutputIt2 probe_val_begin, - OutputIt3 contained_key_begin, - OutputIt4 contained_val_begin, - PairEqual pair_equal) noexcept; - - /** - * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG - * shared memory buffer. - * - * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations - * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in - * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies - * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output. - * - * @tparam buffer_size Size of the output buffer - * @tparam FlushingCG Type of Cooperative Group used to flush output buffer - * @tparam atomicT Type of atomic storage - * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from - * `InputIt`s `value_type`. - * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from - * the map's `value_type`. - * @tparam PairEqual Binary callable type - * @tparam ParentCG Type of parent Cooperative Group - * - * @param flushing_cg The Cooperative Group used to flush output buffer - * @param probing_cg The Cooperative Group used to retrieve - * @param pair The pair to search for - * @param flushing_cg_counter Pointer to the flushing CG counter - * @param probe_output_buffer Buffer of the matched probe pair sequence - * @param contained_output_buffer Buffer of the matched contained pair sequence - * @param num_matches Size of the output sequence - * @param probe_output_begin Beginning of the output sequence of the matched probe pairs - * @param contained_output_begin Beginning of the output sequence of the matched contained - * pairs - * @param pair_equal The binary callable used to compare two pairs for equality - */ - template - __device__ __forceinline__ void pair_retrieve_outer( - FlushingCG flushing_cg, - cooperative_groups::thread_block_tile probing_cg, - value_type const& pair, - uint32_t* flushing_cg_counter, - value_type* probe_output_buffer, - value_type* contained_output_buffer, - atomicT* num_matches, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - PairEqual pair_equal) noexcept; - - private: - using device_view_base::impl_; ///< Implementation detail of `device_view` - }; // class device_view - - /** - * @brief Return the raw pointer of the hash map slots. - * - * @return Raw pointer of the hash map slots - */ - value_type* raw_slots() noexcept - { - // Unsafe access to the slots stripping away their atomic-ness to allow non-atomic access. - // TODO: to be replace by atomic_ref when it's ready - return reinterpret_cast(slots_.get()); - } - - /** - * @brief Return the raw pointer of the hash map slots. - * - * @return Raw pointer of the hash map slots - */ - value_type const* raw_slots() const noexcept - { - // Unsafe access to the slots stripping away their atomic-ness to allow non-atomic access. - // TODO: to be replace by atomic_ref when it's ready - return reinterpret_cast(slots_.get()); - } - - /** - * @brief Gets the maximum number of elements the hash map can hold. - * - * @return The maximum number of elements the hash map can hold - */ - std::size_t get_capacity() const noexcept { return capacity_; } - - /** - * @brief Gets the number of elements in the hash map. - * - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the map - */ - std::size_t get_size(cudaStream_t stream = 0) const noexcept; - - /** - * @brief Gets the load factor of the hash map. - * - * @param stream CUDA stream used to get the load factor - * @return The load factor of the hash map - */ - float get_load_factor(cudaStream_t stream = 0) const noexcept; - - /** - * @brief Gets the sentinel value used to represent an empty key slot. - * - * @return The sentinel value used to represent an empty key slot - */ - Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; } - - /** - * @brief Gets the sentinel value used to represent an empty value slot. - * - * @return The sentinel value used to represent an empty value slot - */ - Value get_empty_value_sentinel() const noexcept { return empty_value_sentinel_; } - - /** - * @brief Constructs a device_view object based on the members of the `static_multimap` - * object. - * - * @return A device_view object based on the members of the `static_multimap` object - */ - device_view get_device_view() const noexcept - { - return device_view(slots_.get(), - capacity_, - empty_key{empty_key_sentinel_}, - empty_value{empty_value_sentinel_}); - } - - /** - * @brief Constructs a device_mutable_view object based on the members of the - * `static_multimap` object - * - * @return A device_mutable_view object based on the members of the `static_multimap` object - */ - device_mutable_view get_device_mutable_view() const noexcept - { - return device_mutable_view(slots_.get(), - capacity_, - empty_key{empty_key_sentinel_}, - empty_value{empty_value_sentinel_}); - } - - private: - std::size_t capacity_{}; ///< Total number of slots - Key empty_key_sentinel_{}; ///< Key value that represents an empty slot - Value empty_value_sentinel_{}; ///< Initial value of empty slot - allocator_type allocator_{}; ///< Allocator used to allocate slots - slot_deleter delete_slots_; ///< Custom slots deleter - std::unique_ptr slots_{}; ///< Pointer to flat slots storage -}; // class static_multimap } // namespace cuco -#include #include diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu index 6ceb92b0a..3a0a2dfa7 100644 --- a/tests/static_multimap/count_test.cu +++ b/tests/static_multimap/count_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -90,14 +90,14 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::default_hash_function>>; - auto map = cuco::experimental::static_multimap, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ + auto map = cuco::static_multimap, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ num_keys * multiplicity, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_multiplicity_count(map, num_keys); diff --git a/tests/static_multimap/find_test.cu b/tests/static_multimap/find_test.cu index 53e10f40d..68e477409 100644 --- a/tests/static_multimap/find_test.cu +++ b/tests/static_multimap/find_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -119,14 +119,14 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::default_hash_function>>; - auto map = cuco::experimental::static_multimap, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ + auto map = cuco::static_multimap, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ num_keys, cuco::empty_key{KEY_SENTINEL}, cuco::empty_value{VAL_SENTINEL}}; test_multimap_find(map, num_keys); diff --git a/tests/static_multimap/for_each_test.cu b/tests/static_multimap/for_each_test.cu index 826f915d6..fc511ee58 100644 --- a/tests/static_multimap/for_each_test.cu +++ b/tests/static_multimap/for_each_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -120,13 +120,13 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing>>; - auto set = cuco::experimental::static_multimap{num_keys, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - {}, - probe{}, - {}, - cuco::storage<2>{}}; + auto set = cuco::static_multimap{num_keys, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + {}, + probe{}, + {}, + cuco::storage<2>{}}; auto unique_keys_begin = thrust::counting_iterator(0); auto gen_duplicate_keys = cuda::proclaim_return_type( diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu index 78bc5c2b0..13d8d1e4e 100644 --- a/tests/static_multimap/heterogeneous_lookup_test.cu +++ b/tests/static_multimap/heterogeneous_lookup_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ // insert key type template -struct key_pair { +struct __align__(sizeof(T) * 2) key_pair { T a; T b; @@ -73,19 +73,16 @@ struct custom_hasher { // User-defined device key equality struct custom_key_equal { template - __device__ bool operator()(LHS const& lhs, RHS const& rhs) + __device__ bool operator()(LHS const& lhs, RHS const& rhs) const { return thrust::raw_reference_cast(lhs).a == thrust::raw_reference_cast(rhs).a; } }; -TEMPLATE_TEST_CASE("static_multimap heterogeneous lookup tests", - "", -#if defined(CUCO_HAS_INDEPENDENT_THREADS) // Key type larger than 8B only supported for sm_70 and - // up - int64_t, -#endif - int32_t) +TEMPLATE_TEST_CASE( + "static_multimap heterogeneous lookup tests", + "", + int32_t) // key_pair = 8 bytes, key_pair = 16 bytes (exceeds 8-byte limit) { using Key = key_pair; using Value = TestType; @@ -98,10 +95,15 @@ TEMPLATE_TEST_CASE("static_multimap heterogeneous lookup tests", constexpr std::size_t capacity = num * 2; cuco::static_multimap, cuda::thread_scope_device, - cuco::cuda_allocator, - cuco::legacy::linear_probing<1, custom_hasher>> - map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; + custom_key_equal, + cuco::linear_probing<1, custom_hasher>, + cuco::cuda_allocator, + cuco::storage<2>> + map{cuco::extent{capacity}, + cuco::empty_key{sentinel_key}, + cuco::empty_value{sentinel_value}}; auto insert_pairs = thrust::make_transform_iterator( thrust::counting_iterator(0), @@ -115,14 +117,14 @@ TEMPLATE_TEST_CASE("static_multimap heterogeneous lookup tests", { thrust::device_vector contained(num); map.insert(insert_pairs, insert_pairs + num); - map.contains(probe_keys, probe_keys + num, contained.begin(), custom_key_equal{}); + map.contains(probe_keys, probe_keys + num, contained.begin()); REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); } SECTION("Non-inserted keys-value pairs should not be contained") { thrust::device_vector contained(num); - map.contains(probe_keys, probe_keys + num, contained.begin(), custom_key_equal{}); + map.contains(probe_keys, probe_keys + num, contained.begin()); REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), cuda::std::identity{})); } } diff --git a/tests/static_multimap/insert_contains_test.cu b/tests/static_multimap/insert_contains_test.cu index 4185c281a..834795a0a 100644 --- a/tests/static_multimap/insert_contains_test.cu +++ b/tests/static_multimap/insert_contains_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,14 +106,14 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::murmurhash3_32>>; - auto map = cuco::experimental::static_multimap, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ + auto map = cuco::static_multimap, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ extent_type{num_keys}, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_insert(map, num_keys); diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 4045a77bd..a5cc07e62 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,14 +96,14 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::murmurhash3_32>>; - auto map = cuco::experimental::static_multimap, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ + auto map = cuco::static_multimap, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_insert_if(map, num_keys); diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index 23f9fb975..fa63a4beb 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * Copyright (c) 2021-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -101,14 +101,14 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::default_hash_function>>; - auto map = cuco::experimental::static_multimap, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ + auto map = cuco::static_multimap, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ num_items * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_multiplicity_two(map, num_items); diff --git a/tests/static_multimap/retrieve_if_test.cu b/tests/static_multimap/retrieve_if_test.cu index 6a30ba38d..44770395f 100644 --- a/tests/static_multimap/retrieve_if_test.cu +++ b/tests/static_multimap/retrieve_if_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,7 +115,7 @@ TEMPLATE_TEST_CASE_SIG("static_multimap retrieve_if", { constexpr size_type num_keys{400}; - using container_type = cuco::experimental::static_multimap; + using container_type = cuco::static_multimap; container_type container{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_multimap/retrieve_test.cu b/tests/static_multimap/retrieve_test.cu index c30a807cb..23418f015 100644 --- a/tests/static_multimap/retrieve_test.cu +++ b/tests/static_multimap/retrieve_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -118,14 +118,14 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::default_hash_function>>; - auto map = cuco::experimental::static_multimap, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ + auto map = cuco::static_multimap, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ num_items * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; test_retrieve(map, num_items);