diff --git a/README.md b/README.md
index 7572dc1c1..fdd7ca0a0 100644
--- a/README.md
+++ b/README.md
@@ -233,7 +233,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::static_multimap` is a fixed-size hash table that supports storing equivalent keys. It uses double hashing by default and supports switching to linear probing. See the Doxygen documentation in `static_multimap.cuh` for more detailed information.
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJylVgtv2zYQ_isHDUXtVJYfaFDEjQN4bYoZK5whTlsUcaHQFG0TkUmNpOx6hv_77ijJlpsM67AWiCHe-7vvjtwFVlgrtbJB_34XyCTod8MgZWqRs4UI-gHPExaEgdW54fTdPpsqOIN3OtsauVg6aPAm9Dq9bgv_vA5h_Hn0fjSEdze3f9zcDu9GN-OIDLzRR8mFsiKBXCXCgFsKGGaM408pCeGzMJQN9KIONEhhGpSyadB8671sdQ4rtgWlHeRWoBtpYS5TAeI7F5kDqYDrVZZKpriAjXRLH6r049OBr6UTPXMM9RlaZPg1r2sCc4fU6d_Suazfbm82m4j5tCNtFu20ULbtj6N31-PJdQtTP5h9UinCC0b8mUuDhc-2wDLMjLMZ5puyDWgDbGEEypymzDdGOqkWIVg9dxtmhPeTSOuMnOXuBLwqT6y_roDwMYXADScwmkwD-HU4GU1C7-fL6O63m0938GV4ezsc342uJ3Bzi80avx9Rq_DrAwzHX-H30fh9CAKhw1Die2aoCkxVEqwiKTCcCHGSxlwXadlMcDmXHCoawUKvhVFYFmTCrGRBOEwy8X5SuZKOOX_2pDgfqj1VU_WLVDzNEwGXPOe6bcmEx6s8dXLFsojny6tTNbc0uXXtRKzRVbwW3GkTkdITFemEYShtc50rgj-uTp7Xt9hPgeR6XuoMUxbBWEU_ZuQrtf5QKocElKqx1jJpTtUO60Q6E0aPYhu7bSaQcgOkhHt7FK1ZmotCWIlIeLAQq8xtY_q0ggoRqWfuAFpd76VmX6gWBwflUpFUObbDUevBuqTft_IvNIQxqpx3XnY6nVKt3cZdgJom5w5bClU_irnrdjoh6oJNNUqLGlpd6rz_sb7TPhOqoe2zKb1WOdkIxtoVTOM0dxKVkfF8qVEFHpXekNcNTXqaIihWGIc5-rjo1IalQ6InzkWqWQJzRmSgeT_vvIh8ucgprPKUVJcVsGENuSvsW-bbBQjHGfTC0vgA_u5pG_anSt7b7rkW7PclsAWZ-v0T-l4WTjImzfO5XQHJbGPcrPXHCIYAMqhoS3V7tQiuBS4yggmW1I2NxuIcrjYblcbX0SIKYbdDPLGGXTfs4k8URYAnvXMiQnEM_qMU7usFHMahUWBWna_Yo4ifDNwlsvqq0WmGP69clEIEbTQru-JsJhY4YdXZ_TeI4xLOuMFyWrhN2OF2drlR8G_Q7iS8gMYY2tBrhiD3b2FfA3nkeYcTgBz0wWk8tWct0oWUaEkV7GycZlcmK1SC-R89TurdMnomPJsJdyC4iy68vnh5cXGx_2fCVKVceevY6XguMZDnx9GmYkajrnRM8OTU5xlCp84wakwxzJrz3BjyZSlxnzJea_eYdbE3mrRZ6NLFmwoFHqBqh5fu7vDsIdZ4mZkHsPl8Lr-jaoK3phMUhbkfQlEkhg8C1fLspfXQ9c58k_0mA3SX5S4mmuAO8zcGZV2E-em6m_9zOpMYr1Is1zZq-dSQ_ICRChI9bfbe702LgRAFZq3mktF9f9icdeI9HCI9VAwljGjSH-pl3ctvD5BoYdVLhzc9PiPCmi1KIysQwAQGg2evjIcDzGVBgnQ9vjhXRoq1-I8Qh0eQKrWCrD7KwWnZyFrU1lPDI7DEKacdS0Hlq5nwu9_nVS08sEudp_hEQ-6N4RX4QS-qs35o8b3ZOCHR4MdkBnAwPBClXC50XeKU4kuanqb4WDXHB3eg1px3e-d5F8U6c8VrPGjhvTvgr15130CLGb4c2FX8pgOtFl7KDv84rFkkrZStZv6JnspZzSfnPMXDdfGexgO8odVjsA8rOVL1RI5MDvbf_P-_AYKkJA4=))
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJyVVg1v2zYQ_Ss3DUXlVJY_sKyI4wTwUg8zVjhDnLYo6kKgKdomIpEqSdn1DP_3HUnJkRcD61oghu6Ox3fvHo_cB5ppzaXQweDLPuBpMOhFQUbEqiQrFgwCWqYkiAItS0Xtd-diLuAC7mSxU3y1NhDSFvS7_V4b__wawfTj5N1kBHf3D3_dP4weJ_fT2C5wi95zyoRmKZQiZQrMmsGoIBR_Kk8EH5myaKAfdyG0AfOg8s2D1rXLspMl5GQHQhooNcM0XMOSZwzYd8oKA1wAlXmRcSIogy03a7dVlcfBgc9VErkwBOMJrijwa9mMBGKO0O2_tTHFoNPZbrcxcbBjqVadzAfrzvvJ3Xg6G7cR-nHZB5EhvaDYt5IrLHyxA1IgMkoWiDcjW5AKyEox9BlpkW8VN1ysItByabZEMZcn5doovijNCXk1Tqy_GYD0EYHEjWYwmc0D-G00m8wil-fT5PGP-w-P8Gn08DCaPk7GM7h_wGZN301sq_DrdxhNP8Ofk-m7CBhSh1ux74WyVSBUbmllqedwxtgJjKX0sHTBKF9yCrWMYCU3TAksCwqmcu4FhyBTlyfjOTfEONuL4txWnbmYi5-5oFmZMhjSksqOtktokpeZ4TkpYlqub0_DzFqV2nSoLIWJrfOFK2Ub3CXZMGqkOh_CDVMEvT4NlpDUlvPxGlvNUHfnvUYRoZGnPP43WC6xgYzkzsyFQXVyEW4kT1tzsUcSUOuWwCe2S8yuYKjHG9SLuX52bUhWMu-sXdZ5XMHywuwS-6mZLYVlTtY30O65LI31PtQbjsFVoA2l2CtjdQHapIOB5n_jQphiyGX3dbfbrcI6HRwUGKlKarDfUDfLH8petxthLOhMotfX0O5ZWbgf7WTgkNgaOg5NlbXGpGOYSuNlSO2h5BiMx4GuJYbAk5Bbm3Vrx0CWISmaKYMY3b6YVEdVQqtdPDSZJCksiZWDHQaX3VexKxcFh1WeKm5YExs1mLvFvhWuXYB0XEA_qhYfyd-_bMPhNMhl259rweFQEevlNBicCHjokxSEq_PYbsH6dDhtNfqDokMCCdTCtXW7sBjGDKecpQnWthtbicUZnHs6rhaP41UcwX6PfGIN-17Uw584jgEt_UsrBG8G91E5D80Cjgci9JzV9pw8seTFkRuiqm_Dbiv68WBfihVo2KrXeduCrfCE1bYvXyFJKjqTkJR2Grdgj6PblErAf1G75_AKwil0oN-KgB-u4dAgeeJ0hycANeg2t8dTOtWiXGyQnWBeneEpugosEynif844a3ZLyQVzara8g6Xbd6H_y-urq6tD3ay6mXotyyzFnm4Y9OuW2rungee8wOrSb91uiZHJkiMwX_Z1c12tprAZ-FzUidXVFkG3qco1o0-YjBhfF96DdubYuxovuB-BupAyu31e08Tobgvv0P8D3zFXHdbAi88Wvty5_h7xLlGQ9oIDJyY3MUGUeeLsOClr3E644YvszQ0rBDhHWb0pX0LYyHYDvkCoZo8by5jZwHCIT4FZSSle4T_BCBHikwCNLt578dbl2C4P3SesKa5v1znuGlyfzT1uDok-pKV73-BMccdDR3624vPGkMzKtd7eb-3l_Zz-4Murjp29SNCCD1D7osM3nnp-pwZiQ2mvf1n20C0L4x-xQRvh3dA3b3pvoU0UXd_oPHnbhXYbryuDfwySydJ2RvKFe9lmfNHISSnN0Ljxz1A0IOfiKThEtR_nwIkfVRccvrr__wANsNIK))
 
 ### `static_multiset`
 
diff --git a/benchmarks/static_multimap/count_bench.cu b/benchmarks/static_multimap/count_bench.cu
index 439a8e5c8..a605ba3c7 100644
--- a/benchmarks/static_multimap/count_bench.cu
+++ b/benchmarks/static_multimap/count_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -57,7 +57,7 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count(
 
   state.add_element_count(num_keys);
 
-  cuco::experimental::static_multimap<Key, Value> map{
+  cuco::static_multimap<Key, Value> map{
     size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
   map.insert(pairs.begin(), pairs.end());
 
diff --git a/benchmarks/static_multimap/insert_bench.cu b/benchmarks/static_multimap/insert_bench.cu
index 0c5367c58..c9b54e51e 100644
--- a/benchmarks/static_multimap/insert_bench.cu
+++ b/benchmarks/static_multimap/insert_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,15 +56,15 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert(
 
   state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
              [&](nvbench::launch& launch, auto& timer) {
-               cuco::experimental::static_multimap<Key, Value> map{size,
-                                                                   cuco::empty_key<Key>{-1},
-                                                                   cuco::empty_value<Value>{-1},
-                                                                   {},
-                                                                   {},
-                                                                   {},
-                                                                   {},
-                                                                   {},
-                                                                   {launch.get_stream()}};
+               cuco::static_multimap<Key, Value> map{size,
+                                                     cuco::empty_key<Key>{-1},
+                                                     cuco::empty_value<Value>{-1},
+                                                     {},
+                                                     {},
+                                                     {},
+                                                     {},
+                                                     {},
+                                                     {launch.get_stream()}};
 
                timer.start();
                map.insert(pairs.begin(), pairs.end(), {launch.get_stream()});
diff --git a/benchmarks/static_multimap/query_bench.cu b/benchmarks/static_multimap/query_bench.cu
index 82fc7c63e..74849b742 100644
--- a/benchmarks/static_multimap/query_bench.cu
+++ b/benchmarks/static_multimap/query_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,11 +62,13 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query(
   map.insert(pairs.begin(), pairs.end());
 
   auto const output_size = map.count(keys.begin(), keys.end());
-  thrust::device_vector<pair_type> output(output_size);
+  thrust::device_vector<Key> output_probe(output_size);
+  thrust::device_vector<pair_type> output_match(output_size);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto const count = map.count(keys.begin(), keys.end(), launch.get_stream());
-    map.retrieve(keys.begin(), keys.end(), output.begin(), launch.get_stream());
+    auto const count = map.count(keys.begin(), keys.end(), {launch.get_stream()});
+    map.retrieve(
+      keys.begin(), keys.end(), output_probe.begin(), output_match.begin(), {launch.get_stream()});
   });
 }
 
diff --git a/benchmarks/static_multimap/retrieve_bench.cu b/benchmarks/static_multimap/retrieve_bench.cu
index 75dc79bd8..594533574 100644
--- a/benchmarks/static_multimap/retrieve_bench.cu
+++ b/benchmarks/static_multimap/retrieve_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,10 +62,12 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve(
   map.insert(pairs.begin(), pairs.end());
 
   auto const output_size = map.count(keys.begin(), keys.end());
-  thrust::device_vector<pair_type> output(output_size);
+  thrust::device_vector<Key> output_probe(output_size);
+  thrust::device_vector<pair_type> output_match(output_size);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    map.retrieve(keys.begin(), keys.end(), output.begin(), launch.get_stream());
+    map.retrieve(
+      keys.begin(), keys.end(), output_probe.begin(), output_match.begin(), {launch.get_stream()});
   });
 }
 
diff --git a/examples/static_multimap/host_bulk_example.cu b/examples/static_multimap/host_bulk_example.cu
index 3a37bc191..c89cb306a 100644
--- a/examples/static_multimap/host_bulk_example.cu
+++ b/examples/static_multimap/host_bulk_example.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,13 @@
 
 #include <cuco/static_multimap.cuh>
 
+#include <thrust/count.h>
 #include <thrust/device_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/sequence.h>
 #include <thrust/transform.h>
 
-#include <limits>
+#include <iostream>
 
 int main(void)
 {
@@ -52,23 +53,22 @@ int main(void)
   // Inserts all pairs into the map
   map.insert(pairs.begin(), pairs.end());
 
-  // Sequence of probe keys {0, 1, 2, ... 49'999}
-  thrust::device_vector<key_type> keys_to_find(N);
+  // Sequence of probe keys {0, 1, 2, ... 24'999}
+  // Each key should have 2 matches in the map
+  thrust::device_vector<key_type> keys_to_find(N / 2);
   thrust::sequence(keys_to_find.begin(), keys_to_find.end(), 0);
 
-  // Counts the occurrences of keys in [0, 50'000) contained in the multimap.
-  // The `_outer` suffix indicates that the occurrence of a non-match is 1.
-  auto const output_size = map.count_outer(keys_to_find.begin(), keys_to_find.end());
+  // Check that keys are contained in the map
+  thrust::device_vector<bool> contained(N / 2);
+  map.contains(keys_to_find.begin(), keys_to_find.end(), contained.begin());
 
-  thrust::device_vector<cuco::pair<key_type, value_type>> d_results(output_size);
+  // Verify all keys are found
+  auto const num_found = thrust::count(contained.begin(), contained.end(), true);
 
-  // Finds all keys {0, 1, 2, ...} and stores associated key/value pairs into `d_results`
-  // If a key `keys_to_find[i]` doesn't exist, `d_results[i].second == empty_value_sentinel`
-  auto output_end = map.retrieve_outer(keys_to_find.begin(), keys_to_find.end(), d_results.begin());
-  auto retrieve_size = output_end - d_results.begin();
-
-  // The total number of outer matches should be `N + N / 2`
-  assert(not(output_size == retrieve_size == N + N / 2));
+  if (num_found == N / 2) {
+    std::cout << "Success! All " << N / 2 << " unique keys found in the multimap.\n";
+    std::cout << "Each key has 2 duplicate values, for a total of " << N << " pairs.\n";
+  }
 
   return 0;
 }
diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh
deleted file mode 100644
index 165b8e903..000000000
--- a/include/cuco/detail/probe_sequence_impl.cuh
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cuco/detail/utils.cuh>
-#include <cuco/pair.cuh>
-
-#include <cuda/std/atomic>
-
-#include <cooperative_groups.h>
-
-#include <utility>
-
-namespace cuco::legacy::detail {
-
-/**
- * @brief Base class of public probe sequence. This class should not be used directly.
- *
- * @tparam CGSize Size of CUDA Cooperative Groups
- */
-template <uint32_t CGSize>
-class probe_sequence_base {
- protected:
-  /**
-   * @brief Returns the size of the CUDA cooperative thread group.
-   */
-  static constexpr std::size_t cg_size = CGSize;
-
-  /**
-   * @brief Returns the number of elements loaded with each vector load.
-   *
-   * @return The number of elements loaded with each vector load
-   */
-  static __host__ __device__ constexpr uint32_t vector_width() noexcept { return 2u; }
-};
-
-/**
- * @brief Base class of probe sequence implementation.
- *
- * Hash map operations are generally memory-bandwidth bound. A vector-load loads two consecutive
- * slots instead of one to fully utilize the 16B memory load supported by SASS/hardware thus
- * improve memory throughput. This method (flagged by `uses_vector_load` logic) is implicitly
- * applied to all hash map operations (e.g. `insert`, `count`, and `retrieve`, etc.) when pairs
- * are packable (see `cuco::detail::is_packable` logic).
- *
- * @tparam Key Type used for keys
- * @tparam Value Type of the mapped values
- * @tparam Scope The scope in which multimap operations will be performed by
- * individual threads
- * @tparam VectorWidth Length of vector load
- * @tparam CGSize Size of CUDA Cooperative Groups
- */
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          uint32_t VectorWidth,
-          uint32_t CGSize>
-class probe_sequence_impl_base {
- protected:
-  using value_type         = cuco::pair<Key, Value>;            ///< Type of key/value pairs
-  using key_type           = Key;                               ///< Key type
-  using mapped_type        = Value;                             ///< Type of mapped values
-  using atomic_key_type    = cuda::atomic<key_type, Scope>;     ///< Type of atomic keys
-  using atomic_mapped_type = cuda::atomic<mapped_type, Scope>;  ///< Type of atomic mapped values
-  /// Pair type of atomic key and atomic mapped value
-  using pair_atomic_type = cuco::pair<atomic_key_type, atomic_mapped_type>;
-  /// Type of the forward iterator to `pair_atomic_type`
-  using iterator = pair_atomic_type*;
-  /// Type of the forward iterator to `const pair_atomic_type`
-  using const_iterator = pair_atomic_type const*;
-
-  /**
-   * @brief Returns the number of elements loaded with each vector-load.
-   */
-  static constexpr uint32_t vector_width = VectorWidth;
-
-  /**
-   * @brief Returns the size of the CUDA cooperative thread group.
-   */
-  static constexpr std::size_t cg_size = CGSize;
-
-  /**
-   * @brief Indicates if vector-load is used.
-   *
-   * Users have no explicit control on whether vector-load is used.
-   *
-   * @return Boolean indicating if vector-load is used.
-   */
-  __host__ __device__ static constexpr bool uses_vector_load() noexcept
-  {
-    return cuco::detail::is_packable<value_type>();
-  }
-
-  /**
-   * @brief Constructs a probe sequence based on the given hash map features.
-   *
-   * @param slots Pointer to beginning of the hash map slots
-   * @param capacity Capacity of the hash map
-   */
-  __host__ __device__ explicit probe_sequence_impl_base(iterator slots, std::size_t capacity)
-    : slots_{slots}, capacity_{capacity}
-  {
-  }
-
- public:
-  /**
-   * @brief Returns the capacity of the hash map.
-   *
-   * @return The capacity of the hash map
-   */
-  __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept
-  {
-    return capacity_;
-  }
-
-  /**
-   * @brief Returns slots array.
-   *
-   * @return Slots array
-   */
-  __device__ __forceinline__ iterator get_slots() noexcept { return slots_; }
-
-  /**
-   * @brief Returns slots array.
-   *
-   * @return Slots array
-   */
-  __device__ __forceinline__ const_iterator get_slots() const noexcept { return slots_; }
-
- protected:
-  iterator slots_;              ///< Pointer to beginning of the hash map slots
-  const std::size_t capacity_;  ///< Total number of slots
-};  // class probe_sequence_impl_base
-
-/**
- * @brief Cooperative Groups based Linear probing scheme.
- *
- * @tparam Key Type used for keys
- * @tparam Value Type of the mapped values
- * @tparam Scope The scope in which multimap operations will be performed by
- * individual threads
- * @tparam VectorWidth Length of vector load
- * @tparam CGSize Size of CUDA Cooperative Groups
- * @tparam Hash Unary callable type
- */
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          uint32_t VectorWidth,
-          int32_t CGSize,
-          typename Hash>
-class linear_probing_impl
-  : public probe_sequence_impl_base<Key, Value, Scope, VectorWidth, CGSize> {
- public:
-  using probe_sequence_impl_base_type =
-    probe_sequence_impl_base<Key, Value, Scope, VectorWidth, CGSize>;
-  using value_type         = typename probe_sequence_impl_base_type::value_type;
-  using key_type           = typename probe_sequence_impl_base_type::key_type;
-  using mapped_type        = typename probe_sequence_impl_base_type::mapped_type;
-  using atomic_key_type    = typename probe_sequence_impl_base_type::atomic_key_type;
-  using atomic_mapped_type = typename probe_sequence_impl_base_type::atomic_mapped_type;
-  using pair_atomic_type   = typename probe_sequence_impl_base_type::pair_atomic_type;
-  using iterator           = typename probe_sequence_impl_base_type::iterator;
-  using const_iterator     = typename probe_sequence_impl_base_type::const_iterator;
-
-  using probe_sequence_impl_base_type::capacity_;
-  using probe_sequence_impl_base_type::cg_size;
-  using probe_sequence_impl_base_type::slots_;
-  using probe_sequence_impl_base_type::uses_vector_load;
-  using probe_sequence_impl_base_type::vector_width;
-
-  /**
-   * @brief Constructs a linear probing scheme based on the given hash map features.
-   *
-   * @param slots Pointer to beginning of the hash map slots
-   * @param capacity Capacity of the hash map
-   * @param hash Unary function to hash each key
-   */
-  __host__ __device__ explicit linear_probing_impl(iterator slots, std::size_t capacity)
-    : probe_sequence_impl_base_type{slots, capacity}, hash_{Hash{}}
-  {
-  }
-
-  /**
-   * @brief Returns the initial slot for a given key `k`.
-   *
-   * If vector-load is enabled, the return slot is always even to avoid illegal memory access.
-   *
-   * @tparam ProbeKey Probe key type
-   * @tparam ParentCG Type of parent Cooperative Group
-   *
-   * @param g the Cooperative Group for which the initial slot is needed
-   * @param k The key to get the slot for
-   * @return Pointer to the initial slot for `k`
-   */
-  template <typename ProbeKey, typename ParentCG>
-  __device__ __forceinline__ iterator initial_slot(
-    cooperative_groups::thread_block_tile<cg_size, ParentCG> g, ProbeKey const& k) noexcept
-  {
-    return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
-  }
-
-  /**
-   * @brief Returns the initial slot for a given key `k`.
-   *
-   * If vector-load is enabled, the return slot is always even to avoid illegal memory access.
-   *
-   * @tparam ProbeKey Probe key type
-   * @tparam ParentCG Type of parent Cooperative Group
-   *
-   * @param g the Cooperative Group for which the initial slot is needed
-   * @param k The key to get the slot for
-   * @return Pointer to the initial slot for `k`
-   */
-  template <typename ProbeKey, typename ParentCG>
-  __device__ __forceinline__ const_iterator initial_slot(
-    cooperative_groups::thread_block_tile<cg_size, ParentCG> g, ProbeKey const& k) const noexcept
-  {
-    auto const hash_value = [&]() {
-      auto const tmp = hash_(k);
-      if constexpr (uses_vector_load()) {
-        // initial hash value is always even
-        return tmp + tmp % 2;
-      }
-      if constexpr (not uses_vector_load()) { return tmp; }
-    }();
-
-    auto const offset = [&]() {
-      if constexpr (uses_vector_load()) { return g.thread_rank() * vector_width; }
-      if constexpr (not uses_vector_load()) { return g.thread_rank(); }
-    }();
-
-    // Each CG accesses to a bucket of (`cg_size` * `vector_width`)
-    // slots if vector-load is used or `cg_size` slots otherwise
-    return &slots_[(hash_value + offset) % capacity_];
-  }
-
-  /**
-   * @brief Given a slot `s`, returns the next slot.
-   *
-   * If `s` is the last slot, wraps back around to the first slot.
-   *
-   * @param s The slot to advance
-   * @return The next slot after `s`
-   */
-  __device__ __forceinline__ iterator next_slot(iterator s) noexcept
-  {
-    return const_cast<iterator>(cuda::std::as_const(*this).next_slot(s));
-  }
-
-  /**
-   * @brief Given a slot `s`, returns the next slot.
-   *
-   * If `s` is the last slot, wraps back around to the first slot.
-   *
-   * @param s The slot to advance
-   * @return The next slot after `s`
-   */
-  __device__ __forceinline__ const_iterator next_slot(const_iterator s) const noexcept
-  {
-    std::size_t index = s - slots_;
-    std::size_t offset;
-    if constexpr (uses_vector_load()) {
-      offset = cg_size * vector_width;
-    } else {
-      offset = cg_size;
-    }
-    return &slots_[(index + offset) % capacity_];
-  }
-
- private:
-  Hash hash_;  ///< The unary callable used to hash the key
-};  // class linear_probing
-
-/**
- * @brief Cooperative Groups based double hashing scheme.
- *
- * Default probe sequence for `cuco::static_multimap`. Double hashing shows superior
- * performance when dealing with high multiplicty and/or high occupancy use cases. Performance
- * hints:
- * - `CGSize` = 1 or 2 when hash map is small (10'000'000 or less), 4 or 8 otherwise.
- *
- * `Hash1` and `Hash2` should be callable object type.
- *
- * @tparam Key Type used for keys
- * @tparam Value Type of the mapped values
- * @tparam Scope The scope in which multimap operations will be performed by
- * individual threads
- * @tparam VectorWidth Length of vector load
- * @tparam CGSize Size of CUDA Cooperative Groups
- * @tparam Hash1 Unary callable type
- * @tparam Hash2 Unary callable type
- */
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          uint32_t VectorWidth,
-          uint32_t CGSize,
-          typename Hash1,
-          typename Hash2>
-class double_hashing_impl
-  : public probe_sequence_impl_base<Key, Value, Scope, VectorWidth, CGSize> {
- public:
-  using probe_sequence_impl_base_type =
-    probe_sequence_impl_base<Key, Value, Scope, VectorWidth, CGSize>;
-  using value_type         = typename probe_sequence_impl_base_type::value_type;
-  using key_type           = typename probe_sequence_impl_base_type::key_type;
-  using mapped_type        = typename probe_sequence_impl_base_type::mapped_type;
-  using atomic_key_type    = typename probe_sequence_impl_base_type::atomic_key_type;
-  using atomic_mapped_type = typename probe_sequence_impl_base_type::atomic_mapped_type;
-  using pair_atomic_type   = typename probe_sequence_impl_base_type::pair_atomic_type;
-  using iterator           = typename probe_sequence_impl_base_type::iterator;
-  using const_iterator     = typename probe_sequence_impl_base_type::const_iterator;
-
-  using probe_sequence_impl_base_type::capacity_;
-  using probe_sequence_impl_base_type::cg_size;
-  using probe_sequence_impl_base_type::slots_;
-  using probe_sequence_impl_base_type::uses_vector_load;
-  using probe_sequence_impl_base_type::vector_width;
-
-  /**
-   * @brief Constructs a double hashing scheme based on the given hash map features.
-   *
-   * `hash2` takes a different seed to reduce the chance of secondary clustering.
-   *
-   * @param slots Pointer to beginning of the hash map slots
-   * @param capacity Capacity of the hash map
-   * @param hash1 First hasher to hash each key
-   * @param hash2 Second hasher to determine step size
-   */
-  __host__ __device__ explicit double_hashing_impl(iterator slots, std::size_t capacity)
-    : probe_sequence_impl_base_type{slots, capacity},
-      hash1_{Hash1{}},
-      hash2_{Hash2{1}},
-      step_size_{}
-  {
-  }
-
-  /**
-   * @brief Returns the initial slot for a given key `k`.
-   *
-   * If vector-load is enabled, the return slot is always a multiple of (`cg_size` * `vector_width`)
-   * to avoid illegal memory access.
-   *
-   * @tparam ProbeKey Probe key type
-   * @tparam ParentCG Type of parent Cooperative Group
-   *
-   * @param g the Cooperative Group for which the initial slot is needed
-   * @param k The key to get the slot for
-   * @return Pointer to the initial slot for `k`
-   */
-  template <typename ProbeKey, typename ParentCG>
-  __device__ __forceinline__ iterator initial_slot(
-    cooperative_groups::thread_block_tile<cg_size, ParentCG> g, ProbeKey const& k) noexcept
-  {
-    return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
-  }
-
-  /**
-   * @brief Returns the initial slot for a given key `k`.
-   *
-   * If vector-load is enabled, the return slot is always a multiple of (`cg_size` * `vector_width`)
-   * to avoid illegal memory access.
-   *
-   * @tparam ProbeKey Probe key type
-   * @tparam ParentCG Type of parent Cooperative Group
-   *
-   * @param g the Cooperative Group for which the initial slot is needed
-   * @param k The key to get the slot for
-   * @return Pointer to the initial slot for `k`
-   */
-  template <typename ProbeKey, typename ParentCG>
-  __device__ __forceinline__ const_iterator initial_slot(
-    cooperative_groups::thread_block_tile<cg_size, ParentCG> g, ProbeKey const& k) const noexcept
-  {
-    std::size_t index;
-    auto const hash_value = hash1_(k);
-    if constexpr (uses_vector_load()) {
-      // step size in range [1, prime - 1] * cg_size * vector_width
-      step_size_ =
-        (hash2_(k) % (capacity_ / (cg_size * vector_width) - 1) + 1) * cg_size * vector_width;
-      index = hash_value % (capacity_ / (cg_size * vector_width)) * cg_size * vector_width +
-              g.thread_rank() * vector_width;
-    } else {
-      // step size in range [1, prime - 1] * cg_size
-      step_size_ = (hash2_(k) % (capacity_ / cg_size - 1) + 1) * cg_size;
-      index      = (hash_value + g.thread_rank()) % capacity_;
-    }
-    return slots_ + index;
-  }
-
-  /**
-   * @brief Given a slot `s`, returns the next slot.
-   *
-   * If `s` is the last slot, wraps back around to the first slot.
-   *
-   * @param s The slot to advance
-   * @return The next slot after `s`
-   */
-  __device__ __forceinline__ iterator next_slot(iterator s) noexcept
-  {
-    return const_cast<iterator>(cuda::std::as_const(*this).next_slot(s));
-  }
-
-  /**
-   * @brief Given a slot `s`, returns the next slot.
-   *
-   * If `s` is the last slot, wraps back around to the first slot.
-   *
-   * @param s The slot to advance
-   * @return The next slot after `s`
-   */
-  __device__ __forceinline__ const_iterator next_slot(const_iterator s) const noexcept
-  {
-    std::size_t index = s - slots_;
-    return &slots_[(index + step_size_) % capacity_];
-  }
-
- private:
-  Hash1 hash1_;                    ///< The first unary callable used to hash the key
-  Hash2 hash2_;                    ///< The second unary callable used to determine step size
-  mutable std::size_t step_size_;  ///< The step stride when searching for the next slot
-};  // class double_hashing
-
-/**
- * @brief Probe sequence used internally by hash map.
- *
- * @tparam ProbeImpl Type of probe sequence implementation
- * @tparam Key Type used for keys
- * @tparam Value Type of the mapped values
- * @tparam Scope The scope in which multimap operations will be performed by
- * individual threads
- */
-template <typename ProbeImpl, typename Key, typename Value, cuda::thread_scope Scope>
-class probe_sequence : public ProbeImpl::template impl<Key, Value, Scope> {
- public:
-  using impl_type =
-    typename ProbeImpl::template impl<Key, Value, Scope>;  ///< Type of implementation details
-
-  /**
-   * @brief Constructs a probe sequence based on the given hash map features.
-   *
-   * @param slots Pointer to beginning of the hash map slots
-   * @param capacity Capacity of the hash map
-   */
-  __host__ __device__ explicit probe_sequence(typename impl_type::iterator slots,
-                                              std::size_t capacity)
-    : impl_type{slots, capacity}
-  {
-  }
-};  // class probe_sequence
-
-}  // namespace cuco::legacy::detail
diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl
deleted file mode 100644
index f97f3437e..000000000
--- a/include/cuco/detail/static_multimap/device_view_impl.inl
+++ /dev/null
@@ -1,1569 +0,0 @@
-/*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuco/detail/bitwise_compare.cuh>
-#include <cuco/detail/static_multimap/kernels.cuh>
-#include <cuco/detail/utils.cuh>
-
-#include <thrust/type_traits/is_contiguous_iterator.h>
-
-#include <cooperative_groups.h>
-#include <cooperative_groups/memcpy_async.h>
-
-namespace cuco {
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_impl_base {
- protected:
-  // Import member type definitions from `static_multimap`
-  using value_type          = value_type;
-  using key_type            = Key;
-  using mapped_type         = Value;
-  using iterator            = pair_atomic_type*;
-  using const_iterator      = pair_atomic_type const*;
-  using probe_sequence_type = probe_sequence_type;
-
-  /**
-   * @brief Indicates if vector-load is used.
-   *
-   * Users have no explicit control on whether vector-load is used.
-   *
-   * @return Boolean indicating if vector-load is used.
-   */
-  static constexpr bool uses_vector_load() noexcept
-  {
-    return probe_sequence_type::uses_vector_load();
-  }
-
-  /**
-   * @brief Returns the number of pairs loaded with each vector-load
-   */
-  static constexpr uint32_t vector_width() noexcept { return probe_sequence_type::vector_width(); }
-
-  __host__ __device__ device_view_impl_base(pair_atomic_type* slots,
-                                            std::size_t capacity,
-                                            Key empty_key_sentinel,
-                                            Value empty_value_sentinel) noexcept
-    : probe_sequence_{slots, capacity},
-      empty_key_sentinel_{empty_key_sentinel},
-      empty_value_sentinel_{empty_value_sentinel}
-  {
-  }
-
-  /**
-   * @brief Returns the initial slot for a given key `k`
-   *
-   * To be used for Cooperative Group based probing.
-   *
-   * @tparam ProbeKey Probe key type
-   * @tparam ParentCG Type of parent Cooperative Group
-   *
-   * @param g the Cooperative Group for which the initial slot is needed
-   * @param k The key to get the slot for
-   * @return Pointer to the initial slot for `k`
-   */
-  template <typename ProbeKey, typename ParentCG>
-  __device__ __forceinline__ iterator
-  initial_slot(cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-               ProbeKey const& k) noexcept
-  {
-    return probe_sequence_.initial_slot(g, k);
-  }
-
-  /**
-   * @brief Returns the initial slot for a given key `k`
-   *
-   * To be used for Cooperative Group based probing.
-   *
-   * @tparam ProbeKey Probe key type
-   * @tparam ParentCG Type of parent Cooperative Group
-   *
-   * @param g the Cooperative Group for which the initial slot is needed
-   * @param k The key to get the slot for
-   * @return Pointer to the initial slot for `k`
-   */
-  template <typename ProbeKey, typename ParentCG>
-  __device__ __forceinline__ const_iterator
-  initial_slot(cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-               ProbeKey const& k) const noexcept
-  {
-    return probe_sequence_.initial_slot(g, k);
-  }
-
-  /**
-   * @brief Given a slot `s`, returns the next slot.
-   *
-   * If `s` is the last slot, wraps back around to the first slot. To
-   * be used for Cooperative Group based probing.
-   *
-   * @param s The slot to advance
-   * @return The next slot after `s`
-   */
-  __device__ __forceinline__ iterator next_slot(iterator s) noexcept
-  {
-    return probe_sequence_.next_slot(s);
-  }
-
-  /**
-   * @brief Given a slot `s`, returns the next slot.
-   *
-   * If `s` is the last slot, wraps back around to the first slot. To
-   * be used for Cooperative Group based probing.
-   *
-   * @param s The slot to advance
-   * @return The next slot after `s`
-   */
-  __device__ __forceinline__ const_iterator next_slot(const_iterator s) const noexcept
-  {
-    return probe_sequence_.next_slot(s);
-  }
-
-  /**
-   * @brief Load two key/value pairs from the given slot to the target pair array.
-   *
-   * @param arr The pair array to be loaded
-   * @param current_slot The given slot to load from
-   */
-  __device__ __forceinline__ void load_pair_array(value_type* arr,
-                                                  const_iterator current_slot) const noexcept
-  {
-    if constexpr (sizeof(value_type) == 4) {
-      auto const tmp = *reinterpret_cast<ushort4 const*>(current_slot);
-      memcpy(&arr[0], &tmp, 2 * sizeof(value_type));
-    } else {
-      auto const tmp = *reinterpret_cast<uint4 const*>(current_slot);
-      memcpy(&arr[0], &tmp, 2 * sizeof(value_type));
-    }
-  }
-
- public:
-  /**
-   * @brief Gets the sentinel value used to represent an empty key slot.
-   *
-   * @return The sentinel value used to represent an empty key slot
-   */
-  __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept
-  {
-    return empty_key_sentinel_;
-  }
-
-  /**
-   * @brief Gets the sentinel value used to represent an empty value slot.
-   *
-   * @return The sentinel value used to represent an empty value slot
-   */
-  __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept
-  {
-    return empty_value_sentinel_;
-  }
-
-  /**
-   * @brief Gets slots array.
-   *
-   * @return Slots array
-   */
-  __device__ __forceinline__ pair_atomic_type* get_slots() noexcept
-  {
-    return probe_sequence_.get_slots();
-  }
-
-  /**
-   * @brief Gets slots array.
-   *
-   * @return Slots array
-   */
-  __device__ __forceinline__ pair_atomic_type const* get_slots() const noexcept
-  {
-    return probe_sequence_.get_slots();
-  }
-
-  /**
-   * @brief Gets the maximum number of elements the hash map can hold.
-   *
-   * @return The maximum number of elements the hash map can hold
-   */
-  __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept
-  {
-    return probe_sequence_.get_capacity();
-  }
-
- private:
-  probe_sequence_type probe_sequence_;  ///< Probe sequence used to probe the hash map
-  Key empty_key_sentinel_{};            ///< Key value that represents an empty slot
-  Value empty_value_sentinel_{};        ///< Initial Value of empty slot
-};  // class device_view_impl_base
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_mutable_view_impl
-  : public device_view_impl_base {
- public:
-  using value_type     = typename device_view_impl_base::value_type;
-  using key_type       = typename device_view_impl_base::key_type;
-  using mapped_type    = typename device_view_impl_base::mapped_type;
-  using iterator       = typename device_view_impl_base::iterator;
-  using const_iterator = typename device_view_impl_base::const_iterator;
-
- private:
-  /**
-   * @brief Enumeration of the possible results of attempting to insert into a hash slot.
-   */
-  enum class insert_result {
-    CONTINUE,  ///< Insert did not succeed, continue trying to insert
-    SUCCESS,   ///< New pair inserted successfully
-    DUPLICATE  ///< Insert did not succeed, key is already present
-  };
-
-  /**
-   * @brief Inserts the specified key/value pair with one single CAS operation.
-   *
-   * @param current_slot The slot to insert
-   * @param insert_pair The pair to insert
-   * @param key_equal The binary callable used to compare two keys for
-   * equality
-   * @return An insert result from the `insert_resullt` enumeration.
-   */
-  __device__ __forceinline__ insert_result packed_cas(iterator current_slot,
-                                                      value_type const& insert_pair) noexcept
-  {
-    auto expected_key   = this->get_empty_key_sentinel();
-    auto expected_value = this->get_empty_value_sentinel();
-
-    cuco::detail::pair_converter<value_type> expected_pair{
-      cuco::make_pair(expected_key, expected_value)};
-    cuco::detail::pair_converter<value_type> new_pair{insert_pair};
-
-    auto slot = reinterpret_cast<
-      cuda::atomic<typename cuco::detail::pair_converter<value_type>::packed_type, Scope>*>(
-      current_slot);
-
-    bool success = slot->compare_exchange_strong(
-      expected_pair.packed, new_pair.packed, cuda::std::memory_order_relaxed);
-    if (success) { return insert_result::SUCCESS; }
-
-    return insert_result::CONTINUE;
-  }
-
-  /**
-   * @brief Inserts the specified key/value pair with two back-to-back CAS operations.
-   *
-   * @param current_slot The slot to insert
-   * @param insert_pair The pair to insert
-   * @return An insert result from the `insert_resullt` enumeration.
-   */
-  __device__ __forceinline__ insert_result back_to_back_cas(iterator current_slot,
-                                                            value_type const& insert_pair) noexcept
-  {
-    using cuda::std::memory_order_relaxed;
-
-    auto expected_key   = this->get_empty_key_sentinel();
-    auto expected_value = this->get_empty_value_sentinel();
-
-    // Back-to-back CAS for 8B/8B key/value pairs
-    auto& slot_key   = current_slot->first;
-    auto& slot_value = current_slot->second;
-
-    bool key_success =
-      slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed);
-    bool value_success =
-      slot_value.compare_exchange_strong(expected_value, insert_pair.second, memory_order_relaxed);
-
-    if (key_success) {
-      while (not value_success) {
-        value_success =
-          slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(),
-                                             insert_pair.second,
-                                             memory_order_relaxed);
-      }
-      return insert_result::SUCCESS;
-    } else if (value_success) {
-      slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed);
-    }
-
-    return insert_result::CONTINUE;
-  }
-
-  /**
-   * @brief Inserts the specified key/value pair with a CAS of the key and a dependent write
-   * of the value.
-   *
-   * @param current_slot The slot to insert
-   * @param insert_pair The pair to insert
-   * @return An insert result from the `insert_resullt` enumeration.
-   */
-  __device__ __forceinline__ insert_result
-  cas_dependent_write(iterator current_slot, value_type const& insert_pair) noexcept
-  {
-    using cuda::std::memory_order_relaxed;
-    auto expected_key = this->get_empty_key_sentinel();
-
-    auto& slot_key = current_slot->first;
-
-    auto const key_success =
-      slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed);
-
-    if (key_success) {
-      auto& slot_value = current_slot->second;
-      slot_value.store(insert_pair.second, memory_order_relaxed);
-      return insert_result::SUCCESS;
-    }
-
-    return insert_result::CONTINUE;
-  }
-
- public:
-  __host__ __device__ device_mutable_view_impl(pair_atomic_type* slots,
-                                               std::size_t capacity,
-                                               Key empty_key_sentinel,
-                                               Value empty_value_sentinel) noexcept
-    : device_view_impl_base{slots, capacity, empty_key_sentinel, empty_value_sentinel}
-  {
-  }
-
-  /**
-   * @brief Inserts the specified key/value pair into the map using vector loads.
-   *
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam CG Cooperative Group type
-   *
-   * @param g The Cooperative Group that performs the insert
-   * @param insert_pair The pair to insert
-   * @return void.
-   */
-  template <bool uses_vector_load, typename CG>
-  __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, void> insert(
-    CG g, value_type const& insert_pair) noexcept
-  {
-    auto current_slot = initial_slot(g, insert_pair.first);
-    while (true) {
-      value_type arr[2];
-      load_pair_array(&arr[0], current_slot);
-
-      // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as
-      // the sentinel is not a valid key value. Therefore, first check for the sentinel
-      auto const first_slot_is_empty =
-        (detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel()));
-      auto const second_slot_is_empty =
-        (detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel()));
-      auto const bucket_contains_empty = g.ballot(first_slot_is_empty or second_slot_is_empty);
-
-      if (bucket_contains_empty) {
-        // the first lane in the group with an empty slot will attempt the insert
-        insert_result status{insert_result::CONTINUE};
-        uint32_t src_lane = __ffs(bucket_contains_empty) - 1;
-        if (g.thread_rank() == src_lane) {
-          auto insert_location = first_slot_is_empty ? current_slot : current_slot + 1;
-          // One single CAS operation since vector loads are dedicated to packable pairs
-          status = packed_cas(insert_location, insert_pair);
-        }
-
-        // successful insert
-        if (g.any(status == insert_result::SUCCESS)) { return; }
-        // if we've gotten this far, a different key took our spot
-        // before we could insert. We need to retry the insert on the
-        // same bucket
-      }
-      // if there are no empty slots in the current bucket,
-      // we move onto the next bucket
-      else {
-        current_slot = next_slot(current_slot);
-      }
-    }  // while true
-  }
-
-  /**
-   * @brief Inserts the specified key/value pair into the map using scalar loads.
-   *
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam CG Cooperative Group type
-   *
-   * @param g The Cooperative Group that performs the insert
-   * @param insert_pair The pair to insert
-   * @return void.
-   */
-  template <bool uses_vector_load, typename CG>
-  __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, void> insert(
-    CG g, value_type const& insert_pair) noexcept
-  {
-    auto current_slot = this->initial_slot(g, insert_pair.first);
-
-    while (true) {
-      value_type slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-      auto const& existing_key = slot_contents.first;
-
-      // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as
-      // the sentinel is not a valid key value. Therefore, first check for the sentinel
-      auto const slot_is_empty =
-        detail::bitwise_compare(existing_key, this->get_empty_key_sentinel());
-      auto const bucket_contains_empty = g.ballot(slot_is_empty);
-
-      if (bucket_contains_empty) {
-        // the first lane in the group with an empty slot will attempt the insert
-        insert_result status{insert_result::CONTINUE};
-        uint32_t src_lane = __ffs(bucket_contains_empty) - 1;
-
-        if (g.thread_rank() == src_lane) {
-#if (__CUDA_ARCH__ < 700)
-          status = cas_dependent_write(current_slot, insert_pair);
-#else
-          status = back_to_back_cas(current_slot, insert_pair);
-#endif
-        }
-
-        // successful insert
-        if (g.any(status == insert_result::SUCCESS)) { return; }
-        // if we've gotten this far, a different key took our spot
-        // before we could insert. We need to retry the insert on the
-        // same bucket
-      }
-      // if there are no empty slots in the current bucket,
-      // we move onto the next bucket
-      else {
-        current_slot = this->next_slot(current_slot);
-      }
-    }  // while true
-  }
-};  // class device_mutable_view_impl
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_impl
-  : public device_view_impl_base {
- public:
-  using value_type     = typename device_view_impl_base::value_type;
-  using key_type       = typename device_view_impl_base::key_type;
-  using mapped_type    = typename device_view_impl_base::mapped_type;
-  using iterator       = typename device_view_impl_base::iterator;
-  using const_iterator = typename device_view_impl_base::const_iterator;
-
-  __host__ __device__ device_view_impl(pair_atomic_type* slots,
-                                       std::size_t capacity,
-                                       Key empty_key_sentinel,
-                                       Value empty_value_sentinel) noexcept
-    : device_view_impl_base{slots, capacity, empty_key_sentinel, empty_value_sentinel}
-  {
-  }
-
-  /**
-   * @brief Flushes per-CG buffer into the output sequence.
-   *
-   * A given CUDA Cooperative Group, `g`, loads `num_outputs` key-value pairs from `output_buffer`
-   * and writes them into global memory in a coalesced fashion. CG-wide `memcpy_sync` is used if
-   * `thrust::is_contiguous_iterator_v<OutputIt>` returns true. All threads of `g` must be active
-   * due to implicit CG-wide synchronization during flushing.
-   *
-   * @tparam CG Cooperative Group type
-   * @tparam atomicT Type of atomic storage
-   * @tparam OutputIt Device accessible output iterator whose `value_type` is
-   * constructible from the map's `value_type`
-   * @param g The Cooperative Group used to flush output buffer
-   * @param num_outputs Number of valid output in the buffer
-   * @param output_buffer Buffer of the key/value pair sequence
-   * @param num_matches Size of the output sequence
-   * @param output_begin Beginning of the output sequence of key/value pairs
-   */
-  template <typename CG, typename atomicT, typename OutputIt>
-  __device__ __forceinline__ void flush_output_buffer(CG g,
-                                                      uint32_t const num_outputs,
-                                                      value_type* output_buffer,
-                                                      atomicT* num_matches,
-                                                      OutputIt output_begin) noexcept
-  {
-    std::size_t offset;
-    const auto lane_id = g.thread_rank();
-    if (0 == lane_id) {
-      offset = num_matches->fetch_add(num_outputs, cuda::std::memory_order_relaxed);
-    }
-    offset = g.shfl(offset, 0);
-
-    if constexpr (thrust::is_contiguous_iterator_v<OutputIt>) {
-#if defined(CUCO_HAS_CUDA_BARRIER)
-      cooperative_groups::memcpy_async(
-        g,
-        &thrust::raw_reference_cast(*(output_begin + offset)),
-        output_buffer,
-        cuda::aligned_size_t<alignof(value_type)>(sizeof(value_type) * num_outputs));
-#else
-      cooperative_groups::memcpy_async(g,
-                                       &thrust::raw_reference_cast(*(output_begin + offset)),
-                                       output_buffer,
-                                       sizeof(value_type) * num_outputs);
-#endif  // end CUCO_HAS_CUDA_BARRIER
-    } else {
-      for (auto index = lane_id; index < num_outputs; index += g.size()) {
-        cuda::std::get<0>(*(output_begin + offset + index)) = output_buffer[index].first;
-        cuda::std::get<1>(*(output_begin + offset + index)) = output_buffer[index].second;
-      }
-    }
-  }
-
-  /**
-   * @brief Flushes per-CG buffer into the output sequences.
-   *
-   * A given CUDA Cooperative Group, `g`, loads `num_outputs` elements from `probe_output_buffer`
-   * and `num_outputs` elements from `contained_output_buffer`, then writes them into global
-   * memory started from `probe_output_begin` and `contained_output_begin` respectively. All
-   * threads of `g` must be active due to implicit CG-wide synchronization during flushing.
-   *
-   * @tparam CG Cooperative Group type
-   * @tparam atomicT Type of atomic storage
-   * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-   * `InputIt`s `value_type`.
-   * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `value_type`.
-   * @param g The Cooperative Group used to flush output buffer
-   * @param num_outputs Number of valid output in the buffer
-   * @param probe_output_buffer Buffer of the matched probe pair sequence
-   * @param contained_output_buffer Buffer of the matched contained pair sequence
-   * @param num_matches Size of the output sequence
-   * @param probe_output_begin Beginning of the output sequence of the matched probe pairs
-   * @param contained_output_begin Beginning of the output sequence of the matched contained
-   * pairs
-   */
-  template <typename CG, typename atomicT, typename OutputIt1, typename OutputIt2>
-  __device__ __forceinline__ void flush_output_buffer(CG g,
-                                                      uint32_t const num_outputs,
-                                                      value_type* probe_output_buffer,
-                                                      value_type* contained_output_buffer,
-                                                      atomicT* num_matches,
-                                                      OutputIt1 probe_output_begin,
-                                                      OutputIt2 contained_output_begin) noexcept
-  {
-    std::size_t offset;
-    const auto lane_id = g.thread_rank();
-    if (0 == lane_id) {
-      offset = num_matches->fetch_add(num_outputs, cuda::std::memory_order_relaxed);
-    }
-    offset = g.shfl(offset, 0);
-
-    for (auto index = lane_id; index < num_outputs; index += g.size()) {
-      auto& probe_pair                                          = probe_output_buffer[index];
-      auto& contained_pair                                      = contained_output_buffer[index];
-      cuda::std::get<0>(*(probe_output_begin + offset + index)) = probe_pair.first;
-      cuda::std::get<1>(*(probe_output_begin + offset + index)) = probe_pair.second;
-      cuda::std::get<0>(*(contained_output_begin + offset + index)) = contained_pair.first;
-      cuda::std::get<1>(*(contained_output_begin + offset + index)) = contained_pair.second;
-    }
-  }
-
-  /**
-   * @brief Indicates whether the probe `element` exists in the map using vector loads.
-   *
-   * If `element` was inserted into the map, `contains` returns true. Otherwise, it returns false.
-   * Uses the CUDA Cooperative Groups API to leverage multiple threads to perform a single
-   * `contains` operation. This provides a significant boost in throughput compared to the non
-   * Cooperative Group based `contains` at moderate to high load factors.
-   *
-   * @tparam is_pair_contains `true` if it's a `pair_contains` implementation
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam ProbeT Probe data type
-   * @tparam Equal Binary callable type
-   * @tparam ParentCG Type of parent Cooperative Group
-   *
-   * @param g The Cooperative Group used to perform the contains operation
-   * @param element The probe element to search for
-   * @param equal The binary function to compare input element and slot content for equality
-   * @return A boolean indicating whether the key/value pair represented by `element` was inserted
-   */
-  template <bool is_pair_contains,
-            bool uses_vector_load,
-            typename ProbeT,
-            typename Equal,
-            typename ParentCG>
-  __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, bool> contains(
-    cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-    ProbeT const& element,
-    Equal equal) const noexcept
-  {
-    auto current_slot = [&]() {
-      if constexpr (is_pair_contains) { return initial_slot(g, element.first); }
-      if constexpr (not is_pair_contains) { return initial_slot(g, element); }
-    }();
-
-    while (true) {
-      value_type arr[2];
-      load_pair_array(&arr[0], current_slot);
-
-      auto const first_slot_is_empty =
-        detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel());
-      auto const second_slot_is_empty =
-        detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel());
-      auto const first_equals = [&]() {
-        if constexpr (is_pair_contains) {
-          return not first_slot_is_empty and equal(arr[0], element);
-        }
-        if constexpr (not is_pair_contains) {
-          return not first_slot_is_empty and equal(arr[0].first, element);
-        }
-      }();
-      auto const second_equals = [&]() {
-        if constexpr (is_pair_contains) {
-          return not second_slot_is_empty and equal(arr[1], element);
-        }
-        if constexpr (not is_pair_contains) {
-          return not second_slot_is_empty and equal(arr[1].first, element);
-        }
-      }();
-
-      // the key we were searching for was found by one of the threads, so we return true
-      if (g.any(first_equals or second_equals)) { return true; }
-
-      // we found an empty slot, meaning that the key we're searching for isn't present
-      if (g.any(first_slot_is_empty or second_slot_is_empty)) { return false; }
-
-      // otherwise, all slots in the current bucket are full with other keys, so we move onto the
-      // next bucket
-      current_slot = next_slot(current_slot);
-    }
-  }
-
-  /**
-   * @brief Indicates whether `element` exists in the map using scalar loads.
-   *
-   * If `element` was inserted into the map, `contains` returns true. Otherwise, it returns false.
-   * Uses the CUDA Cooperative Groups API to leverage multiple threads to perform a single
-   * `contains` operation. This provides a significant boost in throughput compared to the non
-   * Cooperative Group `contains` at moderate to high load factors.
-   *
-   * @tparam is_pair_contains `true` if it's a `pair_contains` implementation
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam ProbeT Probe data type
-   * @tparam Equal Binary callable type
-   * @tparam ParentCG Type of parent Cooperative Group
-   *
-   * @param g The Cooperative Group used to perform the contains operation
-   * @param element The probe element to search for
-   * @param equal The binary function to compare input element and slot content for equality
-   * @return A boolean indicating whether the key/value pair represented by `element` was inserted
-   */
-  template <bool is_pair_contains,
-            bool uses_vector_load,
-            typename ProbeT,
-            typename Equal,
-            typename ParentCG>
-  __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, bool> contains(
-    cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-    ProbeT const& element,
-    Equal equal) const noexcept
-  {
-    auto current_slot = [&]() {
-      if constexpr (is_pair_contains) { return this->initial_slot(g, element.first); }
-      if constexpr (not is_pair_contains) { return this->initial_slot(g, element); }
-    }();
-
-    while (true) {
-      value_type slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-      auto const& existing_key = slot_contents.first;
-
-      // The user provide `key_equal` can never be used to compare against `empty_key_sentinel` as
-      // the sentinel is not a valid key value. Therefore, first check for the sentinel
-      auto const slot_is_empty =
-        detail::bitwise_compare(existing_key, this->get_empty_key_sentinel());
-
-      auto const equals = [&]() {
-        if constexpr (is_pair_contains) {
-          return not slot_is_empty and equal(slot_contents, element);
-        }
-        if constexpr (not is_pair_contains) {
-          return not slot_is_empty and equal(existing_key, element);
-        }
-      }();
-
-      // the key we were searching for was found by one of the threads, so we return true
-      if (g.any(equals)) { return true; }
-
-      // we found an empty slot, meaning that the key we're searching for isn't present
-      if (g.any(slot_is_empty)) { return false; }
-
-      // otherwise, all slots in the current bucket are full with other keys, so we move onto the
-      // next bucket
-      current_slot = this->next_slot(current_slot);
-    }
-  }
-
-  /**
-   * @brief Counts the occurrence of a given key contained in multimap using vector loads.
-   *
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam CG Cooperative Group type
-   * @tparam KeyEqual Binary callable type
-   * @param g The Cooperative Group used to perform the count operation
-   * @param k The key to search for
-   * @param key_equal The binary callable used to compare two keys
-   * for equality
-   * @return Number of matches found by the current thread
-   */
-  template <bool uses_vector_load, bool is_outer, typename CG, typename KeyEqual>
-  __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, std::size_t> count(
-    CG g, Key const& k, KeyEqual key_equal) noexcept
-  {
-    std::size_t count = 0;
-    auto current_slot = initial_slot(g, k);
-
-    [[maybe_unused]] bool found_match = false;
-
-    while (true) {
-      value_type arr[2];
-      load_pair_array(&arr[0], current_slot);
-
-      auto const first_slot_is_empty =
-        detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel());
-      auto const second_slot_is_empty =
-        detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel());
-      auto const first_equals  = (not first_slot_is_empty and key_equal(arr[0].first, k));
-      auto const second_equals = (not second_slot_is_empty and key_equal(arr[1].first, k));
-
-      if constexpr (is_outer) {
-        if (g.any(first_equals or second_equals)) { found_match = true; }
-      }
-
-      count += (first_equals + second_equals);
-
-      if (g.any(first_slot_is_empty or second_slot_is_empty)) {
-        if constexpr (is_outer) {
-          if ((not found_match) && (g.thread_rank() == 0)) { count++; }
-        }
-        return count;
-      }
-
-      current_slot = next_slot(current_slot);
-    }
-  }
-
-  /**
-   * @brief Counts the occurrence of a given key contained in multimap using scalar loads.
-   *
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam CG Cooperative Group type
-   * @tparam KeyEqual Binary callable type
-   * @param g The Cooperative Group used to perform the count operation
-   * @param k The key to search for
-   * @param key_equal The binary callable used to compare two keys
-   * for equality
-   * @return Number of matches found by the current thread
-   */
-  template <bool uses_vector_load, bool is_outer, typename CG, typename KeyEqual>
-  __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, std::size_t> count(
-    CG g, Key const& k, KeyEqual key_equal) noexcept
-  {
-    std::size_t count = 0;
-    auto current_slot = initial_slot(g, k);
-
-    [[maybe_unused]] bool found_match = false;
-
-    while (true) {
-      value_type slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-      auto const& current_key  = slot_contents.first;
-
-      auto const slot_is_empty =
-        detail::bitwise_compare(current_key, this->get_empty_key_sentinel());
-      auto const equals = not slot_is_empty and key_equal(current_key, k);
-
-      if constexpr (is_outer) {
-        if (g.any(equals)) { found_match = true; }
-      }
-
-      count += equals;
-
-      if (g.any(slot_is_empty)) {
-        if constexpr (is_outer) {
-          if ((not found_match) && (g.thread_rank() == 0)) { count++; }
-        }
-        return count;
-      }
-
-      current_slot = next_slot(current_slot);
-    }
-  }
-
-  /**
-   * @brief Counts the occurrence of a given key/value pair contained in multimap using vector
-   * loads.
-   *
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam CG Cooperative Group type
-   * @tparam PairEqual Binary callable type
-   * @param g The Cooperative Group used to perform the pair_count operation
-   * @param pair The pair to search for
-   * @param pair_equal The binary callable used to compare two pairs
-   * for equality
-   * @return Number of matches found by the current thread
-   */
-  template <bool uses_vector_load, bool is_outer, typename CG, typename PairEqual>
-  __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, std::size_t> pair_count(
-    CG g, value_type const& pair, PairEqual pair_equal) noexcept
-  {
-    std::size_t count = 0;
-    auto key          = pair.first;
-    auto current_slot = initial_slot(g, key);
-
-    [[maybe_unused]] bool found_match = false;
-
-    while (true) {
-      value_type arr[2];
-      load_pair_array(&arr[0], current_slot);
-
-      auto const first_slot_is_empty =
-        detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel());
-      auto const second_slot_is_empty =
-        detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel());
-
-      auto const first_slot_equals  = (not first_slot_is_empty and pair_equal(arr[0], pair));
-      auto const second_slot_equals = (not second_slot_is_empty and pair_equal(arr[1], pair));
-
-      if constexpr (is_outer) {
-        if (g.any(first_slot_equals or second_slot_equals)) { found_match = true; }
-      }
-
-      count += (first_slot_equals + second_slot_equals);
-
-      if (g.any(first_slot_is_empty or second_slot_is_empty)) {
-        if constexpr (is_outer) {
-          if ((not found_match) && (g.thread_rank() == 0)) { count++; }
-        }
-        return count;
-      }
-
-      current_slot = next_slot(current_slot);
-    }
-  }
-
-  /**
-   * @brief Counts the occurrence of a given key/value pair contained in multimap using scalar
-   * loads.
-   *
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam CG Cooperative Group type
-   * @tparam PairEqual Binary callable type
-   * @param g The Cooperative Group used to perform the pair_count operation
-   * @param pair The pair to search for
-   * @param pair_equal The binary callable used to compare two pairs
-   * for equality
-   * @return Number of matches found by the current thread
-   */
-  template <bool uses_vector_load, bool is_outer, typename CG, typename PairEqual>
-  __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, std::size_t> pair_count(
-    CG g, value_type const& pair, PairEqual pair_equal) noexcept
-  {
-    std::size_t count = 0;
-    auto key          = pair.first;
-    auto current_slot = initial_slot(g, key);
-
-    [[maybe_unused]] bool found_match = false;
-
-    while (true) {
-      auto slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-
-      auto const slot_is_empty =
-        detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel());
-
-      auto const equals = not slot_is_empty and pair_equal(slot_contents, pair);
-
-      if constexpr (is_outer) {
-        if (g.any(equals)) { found_match = true; }
-      }
-
-      count += equals;
-
-      if (g.any(slot_is_empty)) {
-        if constexpr (is_outer) {
-          if ((not found_match) && (g.thread_rank() == 0)) { count++; }
-        }
-        return count;
-      }
-
-      current_slot = next_slot(current_slot);
-    }
-  }
-
-  /**
-   * @brief Retrieves all the matches of a given key contained in multimap using vector
-   * loads with per-flushing-CG shared memory buffer.
-   *
-   * For key `k` existing in the map, copies `k` and all associated values to unspecified
-   * locations in `[output_begin, output_end)`. If `k` does not have any matches, copies `k` and
-   * `empty_value_sentinel()` into the output only if `is_outer` is true.
-   *
-   * @tparam buffer_size Size of the output buffer
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam FlushingCG Type of Cooperative Group used to flush output buffer
-   * @tparam ProbingCG Type of Cooperative Group used to retrieve
-   * @tparam atomicT Type of atomic storage
-   * @tparam OutputIt Device accessible output iterator whose `value_type` is
-   * constructible from the map's `value_type`
-   * @tparam KeyEqual Binary callable type
-   * @param flushing_cg The Cooperative Group used to flush output buffer
-   * @param probing_cg The Cooperative Group used to retrieve
-   * @param k The key to search for
-   * @param flushing_cg_counter Pointer to the flushing cg counter
-   * @param output_buffer Shared memory buffer of the key/value pair sequence
-   * @param num_matches Size of the output sequence
-   * @param output_begin Beginning of the output sequence of key/value pairs
-   * @param key_equal The binary callable used to compare two keys
-   * for equality
-   */
-  template <uint32_t buffer_size,
-            bool is_outer,
-            typename FlushingCG,
-            typename ProbingCG,
-            typename atomicT,
-            typename OutputIt,
-            typename KeyEqual>
-  __device__ __forceinline__ void retrieve(FlushingCG flushing_cg,
-                                           ProbingCG probing_cg,
-                                           Key const& k,
-                                           uint32_t* flushing_cg_counter,
-                                           value_type* output_buffer,
-                                           atomicT* num_matches,
-                                           OutputIt output_begin,
-                                           KeyEqual key_equal) noexcept
-  {
-    const uint32_t cg_lane_id = probing_cg.thread_rank();
-
-    auto current_slot = initial_slot(probing_cg, k);
-
-    bool running                      = true;
-    [[maybe_unused]] bool found_match = false;
-
-    while (flushing_cg.any(running)) {
-      if (running) {
-        value_type arr[2];
-        load_pair_array(&arr[0], current_slot);
-
-        auto const first_slot_is_empty =
-          detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel());
-        auto const second_slot_is_empty =
-          detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel());
-        auto const first_equals  = (not first_slot_is_empty and key_equal(arr[0].first, k));
-        auto const second_equals = (not second_slot_is_empty and key_equal(arr[1].first, k));
-        auto const first_exists  = probing_cg.ballot(first_equals);
-        auto const second_exists = probing_cg.ballot(second_equals);
-
-        if (first_exists or second_exists) {
-          if constexpr (is_outer) { found_match = true; }
-
-          auto const num_first_matches  = __popc(first_exists);
-          auto const num_second_matches = __popc(second_exists);
-
-          uint32_t output_idx;
-          if (0 == cg_lane_id) {
-            output_idx = atomicAdd(flushing_cg_counter, (num_first_matches + num_second_matches));
-          }
-          output_idx = probing_cg.shfl(output_idx, 0);
-
-          if (first_equals) {
-            auto const lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id);
-            output_buffer[output_idx + lane_offset] = cuco::make_pair(k, arr[0].second);
-          }
-          if (second_equals) {
-            auto const lane_offset =
-              detail::count_least_significant_bits(second_exists, cg_lane_id);
-            output_buffer[output_idx + num_first_matches + lane_offset] =
-              cuco::make_pair(k, arr[1].second);
-          }
-        }
-        if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) {
-          running = false;
-          if constexpr (is_outer) {
-            if ((not found_match) && (cg_lane_id == 0)) {
-              auto const output_idx     = atomicAdd(flushing_cg_counter, 1);
-              output_buffer[output_idx] = cuco::make_pair(k, this->get_empty_value_sentinel());
-            }
-          }
-        }
-      }  // if running
-
-      flushing_cg.sync();
-      if (*flushing_cg_counter + flushing_cg.size() * vector_width() > buffer_size) {
-        flush_output_buffer(
-          flushing_cg, *flushing_cg_counter, output_buffer, num_matches, output_begin);
-        // Everyone in the group reads the counter when flushing, so
-        // sync before writing.
-        flushing_cg.sync();
-        // First lane reset warp-level counter
-        if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; }
-        flushing_cg.sync();
-      }
-
-      current_slot = next_slot(current_slot);
-    }  // while running
-  }
-
-  /**
-   * @brief Retrieves all the matches of a given key contained in multimap using scalar
-   * loads with per-CG shared memory buffer.
-   *
-   * For key `k` existing in the map, copies `k` and all associated values to unspecified
-   * locations in `[output_begin, output_end)`. If `k` does not have any matches, copies `k` and
-   * `empty_value_sentinel()` into the output only if `is_outer` is true.
-   *
-   * @tparam buffer_size Size of the output buffer
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam CG Cooperative Group type
-   * @tparam atomicT Type of atomic storage
-   * @tparam OutputIt Device accessible output iterator whose `value_type` is
-   * constructible from the map's `value_type`
-   * @tparam KeyEqual Binary callable type
-   * @param g The Cooperative Group used to retrieve
-   * @param k The key to search for
-   * @param cg_counter Pointer to the CG counter
-   * @param output_buffer Shared memory buffer of the key/value pair sequence
-   * @param num_matches Size of the output sequence
-   * @param output_begin Beginning of the output sequence of key/value pairs
-   * @param key_equal The binary callable used to compare two keys
-   * for equality
-   */
-  template <uint32_t buffer_size,
-            bool is_outer,
-            typename CG,
-            typename atomicT,
-            typename OutputIt,
-            typename KeyEqual>
-  __device__ __forceinline__ void retrieve(CG g,
-                                           Key const& k,
-                                           uint32_t* cg_counter,
-                                           value_type* output_buffer,
-                                           atomicT* num_matches,
-                                           OutputIt output_begin,
-                                           KeyEqual key_equal) noexcept
-  {
-    const uint32_t lane_id = g.thread_rank();
-
-    auto current_slot = initial_slot(g, k);
-
-    bool running                      = true;
-    [[maybe_unused]] bool found_match = false;
-
-    while (running) {
-      // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation
-      // is unsafe!
-      static_assert(sizeof(Key) == sizeof(cuda::atomic<Key>));
-      static_assert(sizeof(Value) == sizeof(cuda::atomic<Value>));
-      value_type slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-
-      auto const slot_is_empty =
-        detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel());
-      auto const equals = (not slot_is_empty and key_equal(slot_contents.first, k));
-      auto const exists = g.ballot(equals);
-
-      uint32_t output_idx = *cg_counter;
-
-      if (exists) {
-        if constexpr (is_outer) { found_match = true; }
-        auto const num_matches = __popc(exists);
-        if (equals) {
-          // Each match computes its lane-level offset
-          auto const lane_offset = detail::count_least_significant_bits(exists, lane_id);
-          output_buffer[output_idx + lane_offset] = cuco::make_pair(k, slot_contents.second);
-        }
-        if (0 == lane_id) { (*cg_counter) += num_matches; }
-      }
-      if (g.any(slot_is_empty)) {
-        running = false;
-        if constexpr (is_outer) {
-          if ((not found_match) && (lane_id == 0)) {
-            output_idx                = (*cg_counter)++;
-            output_buffer[output_idx] = cuco::make_pair(k, this->get_empty_value_sentinel());
-          }
-        }
-      }
-
-      g.sync();
-
-      // Flush if the next iteration won't fit into buffer
-      if ((*cg_counter + g.size()) > buffer_size) {
-        flush_output_buffer(g, *cg_counter, output_buffer, num_matches, output_begin);
-        // Everyone in the group reads the counter when flushing, so
-        // sync before writing.
-        g.sync();
-        // First lane reset CG-level counter
-        if (lane_id == 0) { *cg_counter = 0; }
-        g.sync();
-      }
-      current_slot = next_slot(current_slot);
-    }  // while running
-  }
-
-  /**
-   * @brief Retrieves all the matches of a given pair using vector loads.
-   *
-   * For pair `p` with `n` matching pairs, if `pair_equal(p, slot)` returns true, stores
-   * `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, `contained_key_begin[j] =
-   * slot.first`, and `contained_val_begin[j] = slot.second` for an unspecified value of `j` where
-   * `0 <= j < n`. If `p` does not have any matches, stores `probe_key_begin[0] = p.first`,
-   * `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, and
-   * `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true.
-   *
-   * Concurrent reads or writes to any of the output ranges results in undefined behavior.
-   *
-   * Behavior is undefined if the extent of any of the output ranges is less than `n`.
-   *
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam ProbingCG Type of Cooperative Group used to retrieve
-   * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-   * `pair`'s `Key` type.
-   * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-   * `pair`'s `Value` type.
-   * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `key_type`.
-   * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `mapped_type`.
-   * @tparam PairEqual Binary callable type
-   * @param probing_cg The Cooperative Group used to retrieve
-   * @param pair The pair to search for
-   * @param probe_key_begin Beginning of the output sequence of the matched probe keys
-   * @param probe_val_begin Beginning of the output sequence of the matched probe values
-   * @param contained_key_begin Beginning of the output sequence of the matched contained keys
-   * @param contained_val_begin Beginning of the output sequence of the matched contained values
-   * @param pair_equal The binary callable used to compare two pairs for equality
-   */
-  template <bool is_outer,
-            bool uses_vector_load,
-            typename ProbingCG,
-            typename OutputIt1,
-            typename OutputIt2,
-            typename OutputIt3,
-            typename OutputIt4,
-            typename PairEqual>
-  __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, void> pair_retrieve(
-    ProbingCG probing_cg,
-    value_type const& pair,
-    OutputIt1 probe_key_begin,
-    OutputIt2 probe_val_begin,
-    OutputIt3 contained_key_begin,
-    OutputIt4 contained_val_begin,
-    PairEqual pair_equal) noexcept
-  {
-    auto const lane_id                = probing_cg.thread_rank();
-    auto current_slot                 = initial_slot(probing_cg, pair.first);
-    [[maybe_unused]] auto found_match = false;
-
-    auto num_matches = 0;
-
-    while (true) {
-      value_type arr[2];
-      load_pair_array(&arr[0], current_slot);
-
-      auto const first_slot_is_empty =
-        detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel());
-      auto const second_slot_is_empty =
-        detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel());
-      auto const first_equals  = (not first_slot_is_empty and pair_equal(arr[0], pair));
-      auto const second_equals = (not second_slot_is_empty and pair_equal(arr[1], pair));
-      auto const first_exists  = probing_cg.ballot(first_equals);
-      auto const second_exists = probing_cg.ballot(second_equals);
-
-      if (first_exists or second_exists) {
-        if constexpr (is_outer) { found_match = true; }
-
-        auto const num_first_matches = __popc(first_exists);
-
-        if (first_equals) {
-          auto lane_offset      = detail::count_least_significant_bits(first_exists, lane_id);
-          auto const output_idx = num_matches + lane_offset;
-
-          *(probe_key_begin + output_idx)     = pair.first;
-          *(probe_val_begin + output_idx)     = pair.second;
-          *(contained_key_begin + output_idx) = arr[0].first;
-          *(contained_val_begin + output_idx) = arr[0].second;
-        }
-        if (second_equals) {
-          auto const lane_offset = detail::count_least_significant_bits(second_exists, lane_id);
-          auto const output_idx  = num_matches + num_first_matches + lane_offset;
-
-          *(probe_key_begin + output_idx)     = pair.first;
-          *(probe_val_begin + output_idx)     = pair.second;
-          *(contained_key_begin + output_idx) = arr[1].first;
-          *(contained_val_begin + output_idx) = arr[1].second;
-        }
-        num_matches += (num_first_matches + __popc(second_exists));
-      }
-      if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) {
-        if constexpr (is_outer) {
-          if ((not found_match) and lane_id == 0) {
-            *(probe_key_begin)     = pair.first;
-            *(probe_val_begin)     = pair.second;
-            *(contained_key_begin) = this->get_empty_key_sentinel();
-            *(contained_val_begin) = this->get_empty_value_sentinel();
-          }
-        }
-        return;  // exit if any slot in the current bucket is empty
-      }
-
-      current_slot = next_slot(current_slot);
-    }  // while
-  }
-
-  /**
-   * @brief Retrieves all the matches of a given pair using scalar loads.
-   *
-   * For pair `p` with `n` matching pairs, if `pair_equal(p, slot)` returns true, stores
-   * `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`, `contained_key_begin[j] =
-   * slot.first`, and `contained_val_begin[j] = slot.second` for an unspecified value of `j` where
-   * `0 <= j < n`. If `p` does not have any matches, stores `probe_key_begin[0] = p.first`,
-   * `probe_val_begin[0] = p.second`, `contained_key_begin[0] = empty_key_sentinel`, and
-   * `contained_val_begin[0] = empty_value_sentinel` only if `is_outer` is true.
-   *
-   * Concurrent reads or writes to any of the output ranges results in undefined behavior.
-   *
-   * Behavior is undefined if the extent of any of the output ranges is less than `n`.
-   *
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam uses_vector_load Boolean flag indicating whether vector loads are used
-   * @tparam ProbingCG Type of Cooperative Group used to retrieve
-   * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-   * `pair`'s `Key` type.
-   * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-   * `pair`'s `Value` type.
-   * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `key_type`.
-   * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `mapped_type`.
-   * @tparam PairEqual Binary callable type
-   * @param probing_cg The Cooperative Group used to retrieve
-   * @param pair The pair to search for
-   * @param probe_key_begin Beginning of the output sequence of the matched probe keys
-   * @param probe_val_begin Beginning of the output sequence of the matched probe values
-   * @param contained_key_begin Beginning of the output sequence of the matched contained keys
-   * @param contained_val_begin Beginning of the output sequence of the matched contained values
-   * @param pair_equal The binary callable used to compare two pairs for equality
-   */
-  template <bool is_outer,
-            bool uses_vector_load,
-            typename ProbingCG,
-            typename OutputIt1,
-            typename OutputIt2,
-            typename OutputIt3,
-            typename OutputIt4,
-            typename PairEqual>
-  __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, void> pair_retrieve(
-    ProbingCG probing_cg,
-    value_type const& pair,
-    OutputIt1 probe_key_begin,
-    OutputIt2 probe_val_begin,
-    OutputIt3 contained_key_begin,
-    OutputIt4 contained_val_begin,
-    PairEqual pair_equal) noexcept
-  {
-    auto const lane_id                = probing_cg.thread_rank();
-    auto current_slot                 = initial_slot(probing_cg, pair.first);
-    [[maybe_unused]] auto found_match = false;
-
-    auto num_matches = 0;
-
-    while (true) {
-      // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation
-      // is unsafe!
-      static_assert(sizeof(Key) == sizeof(cuda::atomic<Key>));
-      static_assert(sizeof(Value) == sizeof(cuda::atomic<Value>));
-      value_type slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-
-      auto const slot_is_empty =
-        detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel());
-      auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair));
-      auto const exists = probing_cg.ballot(equals);
-
-      if (exists) {
-        if constexpr (is_outer) { found_match = true; }
-
-        if (equals) {
-          auto const lane_offset = detail::count_least_significant_bits(exists, lane_id);
-          auto const output_idx  = num_matches + lane_offset;
-
-          *(probe_key_begin + output_idx)     = pair.first;
-          *(probe_val_begin + output_idx)     = pair.second;
-          *(contained_key_begin + output_idx) = slot_contents.first;
-          *(contained_val_begin + output_idx) = slot_contents.second;
-        }
-        num_matches += __popc(exists);
-      }
-      if (probing_cg.any(slot_is_empty)) {
-        if constexpr (is_outer) {
-          if ((not found_match) and lane_id == 0) {
-            *(probe_key_begin)     = pair.first;
-            *(probe_val_begin)     = pair.second;
-            *(contained_key_begin) = this->get_empty_key_sentinel();
-            *(contained_val_begin) = this->get_empty_value_sentinel();
-          }
-        }
-        return;  // exit if any slot in the current bucket is empty
-      }
-
-      current_slot = next_slot(current_slot);
-    }  // while
-  }
-
-  /**
-   * @brief Retrieves all the matches of a given pair contained in multimap using vector
-   * loads with per-flushing-CG shared memory buffer.
-   *
-   * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations
-   * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in
-   * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies
-   * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only if
-   * `is_outer` is true.
-   *
-   * @tparam buffer_size Size of the output buffer
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam FlushingCG Type of Cooperative Group used to flush output buffer
-   * @tparam ProbingCG Type of Cooperative Group used to retrieve
-   * @tparam atomicT Type of atomic storage
-   * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-   * `InputIt`s `value_type`.
-   * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `value_type`.
-   * @tparam PairEqual Binary callable type
-   * @param flushing_cg The Cooperative Group used to flush output buffer
-   * @param probing_cg The Cooperative Group used to retrieve
-   * @param pair The pair to search for
-   * @param flushing_cg_counter Pointer to the flushing CG counter
-   * @param probe_output_buffer Buffer of the matched probe pair sequence
-   * @param contained_output_buffer Buffer of the matched contained pair sequence
-   * @param num_matches Size of the output sequence
-   * @param probe_output_begin Beginning of the output sequence of the matched probe pairs
-   * @param contained_output_begin Beginning of the output sequence of the matched contained
-   * pairs
-   * @param pair_equal The binary callable used to compare two pairs for equality
-   */
-  template <uint32_t buffer_size,
-            bool is_outer,
-            typename FlushingCG,
-            typename ProbingCG,
-            typename atomicT,
-            typename OutputIt1,
-            typename OutputIt2,
-            typename PairEqual>
-  __device__ __forceinline__ void pair_retrieve(FlushingCG flushing_cg,
-                                                ProbingCG probing_cg,
-                                                value_type const& pair,
-                                                uint32_t* flushing_cg_counter,
-                                                value_type* probe_output_buffer,
-                                                value_type* contained_output_buffer,
-                                                atomicT* num_matches,
-                                                OutputIt1 probe_output_begin,
-                                                OutputIt2 contained_output_begin,
-                                                PairEqual pair_equal) noexcept
-  {
-    const uint32_t cg_lane_id = probing_cg.thread_rank();
-
-    auto key          = pair.first;
-    auto current_slot = initial_slot(probing_cg, key);
-
-    bool running                      = true;
-    [[maybe_unused]] bool found_match = false;
-
-    while (flushing_cg.any(running)) {
-      if (running) {
-        value_type arr[2];
-        load_pair_array(&arr[0], current_slot);
-
-        auto const first_slot_is_empty =
-          detail::bitwise_compare(arr[0].first, this->get_empty_key_sentinel());
-        auto const second_slot_is_empty =
-          detail::bitwise_compare(arr[1].first, this->get_empty_key_sentinel());
-        auto const first_equals  = (not first_slot_is_empty and pair_equal(arr[0], pair));
-        auto const second_equals = (not second_slot_is_empty and pair_equal(arr[1], pair));
-        auto const first_exists  = probing_cg.ballot(first_equals);
-        auto const second_exists = probing_cg.ballot(second_equals);
-
-        if (first_exists or second_exists) {
-          if constexpr (is_outer) { found_match = true; }
-
-          auto const num_first_matches  = __popc(first_exists);
-          auto const num_second_matches = __popc(second_exists);
-
-          uint32_t output_idx;
-          if (0 == cg_lane_id) {
-            output_idx = atomicAdd(flushing_cg_counter, (num_first_matches + num_second_matches));
-          }
-          output_idx = probing_cg.shfl(output_idx, 0);
-
-          if (first_equals) {
-            auto const lane_offset = detail::count_least_significant_bits(first_exists, cg_lane_id);
-            probe_output_buffer[output_idx + lane_offset]     = pair;
-            contained_output_buffer[output_idx + lane_offset] = arr[0];
-          }
-          if (second_equals) {
-            auto const lane_offset =
-              detail::count_least_significant_bits(second_exists, cg_lane_id);
-            probe_output_buffer[output_idx + num_first_matches + lane_offset]     = pair;
-            contained_output_buffer[output_idx + num_first_matches + lane_offset] = arr[1];
-          }
-        }
-        if (probing_cg.any(first_slot_is_empty or second_slot_is_empty)) {
-          running = false;
-          if constexpr (is_outer) {
-            if ((not found_match) && (cg_lane_id == 0)) {
-              auto const output_idx           = atomicAdd(flushing_cg_counter, 1);
-              probe_output_buffer[output_idx] = pair;
-              contained_output_buffer[output_idx] =
-                cuco::make_pair(this->get_empty_key_sentinel(), this->get_empty_value_sentinel());
-            }
-          }
-        }
-      }  // if running
-
-      flushing_cg.sync();
-      if (*flushing_cg_counter + flushing_cg.size() * vector_width() > buffer_size) {
-        flush_output_buffer(flushing_cg,
-                            *flushing_cg_counter,
-                            probe_output_buffer,
-                            contained_output_buffer,
-                            num_matches,
-                            probe_output_begin,
-                            contained_output_begin);
-        // Everyone in the group reads the counter when flushing, so
-        // sync before writing.
-        flushing_cg.sync();
-        // First lane reset warp-level counter
-        if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; }
-        flushing_cg.sync();
-      }
-
-      current_slot = next_slot(current_slot);
-    }  // while running
-  }
-
-  /**
-   * @brief Retrieves all the matches of a given pair contained in multimap using scalar
-   * loads with per-CG shared memory buffer.
-   *
-   * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations
-   * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in
-   * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies
-   * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output only if
-   * `is_outer` is true.
-   *
-   * @tparam buffer_size Size of the output buffer
-   * @tparam is_outer Boolean flag indicating whether outer join is peformed
-   * @tparam CG Cooperative Group type
-   * @tparam atomicT Type of atomic storage
-   * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-   * `InputIt`s `value_type`.
-   * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `value_type`.
-   * @tparam PairEqual Binary callable type
-   * @param g The Cooperative Group used to retrieve
-   * @param pair The pair to search for
-   * @param cg_counter Pointer to the CG counter
-   * @param probe_output_buffer Buffer of the matched probe pair sequence
-   * @param contained_output_buffer Buffer of the matched contained pair sequence
-   * @param num_matches Size of the output sequence
-   * @param probe_output_begin Beginning of the output sequence of the matched probe pairs
-   * @param contained_output_begin Beginning of the output sequence of the matched contained
-   * pairs
-   * @param pair_equal The binary callable used to compare two pairs for equality
-   */
-  template <uint32_t buffer_size,
-            bool is_outer,
-            typename CG,
-            typename atomicT,
-            typename OutputIt1,
-            typename OutputIt2,
-            typename PairEqual>
-  __device__ __forceinline__ void pair_retrieve(CG g,
-                                                value_type const& pair,
-                                                uint32_t* cg_counter,
-                                                value_type* probe_output_buffer,
-                                                value_type* contained_output_buffer,
-                                                atomicT* num_matches,
-                                                OutputIt1 probe_output_begin,
-                                                OutputIt2 contained_output_begin,
-                                                PairEqual pair_equal) noexcept
-  {
-    const uint32_t lane_id = g.thread_rank();
-
-    auto key          = pair.first;
-    auto current_slot = initial_slot(g, key);
-
-    bool running                      = true;
-    [[maybe_unused]] bool found_match = false;
-
-    while (running) {
-      // TODO: Replace reinterpret_cast with atomic ref when possible. The current implementation
-      // is unsafe!
-      static_assert(sizeof(Key) == sizeof(cuda::atomic<Key>));
-      static_assert(sizeof(Value) == sizeof(cuda::atomic<Value>));
-      value_type slot_contents = *reinterpret_cast<value_type const*>(current_slot);
-
-      auto const slot_is_empty =
-        detail::bitwise_compare(slot_contents.first, this->get_empty_key_sentinel());
-      auto const equals = (not slot_is_empty and pair_equal(slot_contents, pair));
-      auto const exists = g.ballot(equals);
-
-      uint32_t output_idx = *cg_counter;
-
-      if (exists) {
-        if constexpr (is_outer) { found_match = true; }
-        auto const num_matches = __popc(exists);
-        if (equals) {
-          // Each match computes its lane-level offset
-          auto const lane_offset = detail::count_least_significant_bits(exists, lane_id);
-          probe_output_buffer[output_idx + lane_offset]     = pair;
-          contained_output_buffer[output_idx + lane_offset] = slot_contents;
-        }
-        if (0 == lane_id) { (*cg_counter) += num_matches; }
-      }
-      if (g.any(slot_is_empty)) {
-        running = false;
-        if constexpr (is_outer) {
-          if ((not found_match) && (lane_id == 0)) {
-            output_idx                      = (*cg_counter)++;
-            probe_output_buffer[output_idx] = pair;
-            contained_output_buffer[output_idx] =
-              cuco::make_pair(this->get_empty_key_sentinel(), this->get_empty_value_sentinel());
-          }
-        }
-      }
-
-      g.sync();
-
-      // Flush if the next iteration won't fit into buffer
-      if ((*cg_counter + g.size()) > buffer_size) {
-        flush_output_buffer(g,
-                            *cg_counter,
-                            probe_output_buffer,
-                            contained_output_buffer,
-                            num_matches,
-                            probe_output_begin,
-                            contained_output_begin);
-        // Everyone in the group reads the counter when flushing, so
-        // sync before writing.
-        g.sync();
-        // First lane reset CG-level counter
-        if (lane_id == 0) { *cg_counter = 0; }
-        g.sync();
-      }
-      current_slot = next_slot(current_slot);
-    }  // while running
-  }
-};  // class device_view_impl
-
-}  // namespace cuco
diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh
deleted file mode 100644
index 1e7b9d985..000000000
--- a/include/cuco/detail/static_multimap/kernels.cuh
+++ /dev/null
@@ -1,554 +0,0 @@
-/*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuco/detail/utility/cuda.cuh>
-#include <cuco/pair.cuh>
-
-#include <cub/block/block_reduce.cuh>
-#include <cuda/std/atomic>
-#include <cuda/std/iterator>
-#include <thrust/type_traits/is_contiguous_iterator.h>
-
-namespace cuco {
-namespace detail {
-namespace cg = cooperative_groups;
-
-CUCO_SUPPRESS_KERNEL_WARNINGS
-/**
- * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`.
- *
- * Each space in `slots` that can hold a key value pair is initialized to a
- * `pair_atomic_type` containing the key `k` and the value `v`.
- *
- * @tparam atomic_key_type Type of the `Key` atomic container
- * @tparam atomic_mapped_type Type of the `Value` atomic container
- * @tparam Key key type
- * @tparam Value value type
- * @tparam pair_atomic_type key/value pair type
- *
- * @param slots Pointer to flat storage for the map's key/value pairs
- * @param k Key to which all keys in `slots` are initialized
- * @param v Value to which all values in `slots` are initialized
- * @param size Size of the storage pointed to by `slots`
- */
-template <typename atomic_key_type,
-          typename atomic_mapped_type,
-          typename Key,
-          typename Value,
-          typename pair_atomic_type>
-CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size)
-{
-  int64_t const loop_stride = gridDim.x * blockDim.x;
-  int64_t idx               = threadIdx.x + blockIdx.x * blockDim.x;
-  while (idx < size) {
-    new (&slots[idx].first) atomic_key_type{k};
-    new (&slots[idx].second) atomic_mapped_type{v};
-    idx += loop_stride;
-  }
-}
-
-/**
- * @brief Inserts all key/value pairs in the range `[first, last)`.
- *
- * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform each
- * key/value insertion. This provides a significant boost in throughput compared to the non
- * Cooperative Group `insert` at moderate to high load factors.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform
- * inserts
- * @tparam InputIt Device accessible random access input iterator where
- * `std::is_convertible<std::iterator_traits<InputIt>::value_type,
- * static_multimap<K, V>::value_type>` is `true`
- * @tparam viewT Type of device view allowing access of hash map storage
- *
- * @param first Beginning of the sequence of key/value pairs
- * @param n Number of key/value pairs to insert
- * @param view Mutable device view used to access the hash map's slot storage
- */
-template <uint32_t block_size, uint32_t tile_size, typename InputIt, typename viewT>
-CUCO_KERNEL void insert(InputIt first, int64_t n, viewT view)
-{
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-
-  while (idx < n) {
-    // force conversion to value_type
-    typename viewT::value_type const insert_pair{*(first + idx)};
-    view.insert(tile, insert_pair);
-    idx += loop_stride;
-  }
-}
-
-/**
- * @brief Inserts key/value pairs in the range `[first, first + n)` if `pred` of the
- * corresponding stencil returns true.
- *
- * The key/value pair `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
- *
- * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform each
- * key/value insertion. This provides a significant boost in throughput compared to the non
- * Cooperative Group `insert` at moderate to high load factors.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform
- * inserts
- * @tparam InputIt Device accessible random access input iterator where
- * `std::is_convertible<std::iterator_traits<InputIt>::value_type,
- * static_multimap<K, V>::value_type>` is `true`
- * @tparam StencilIt Device accessible random access iterator whose value_type is
- * convertible to Predicate's argument type
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
- * argument type is convertible from `std::iterator_traits<StencilIt>::value_type`.
- *
- * @param first Beginning of the sequence of key/value pairs
- * @param s Beginning of the stencil sequence
- * @param n Number of elements to insert
- * @param view Mutable device view used to access the hash map's slot storage
- * @param pred Predicate to test on every element in the range `[s, s + n)`
- */
-template <uint32_t block_size,
-          uint32_t tile_size,
-          typename InputIt,
-          typename StencilIt,
-          typename viewT,
-          typename Predicate>
-CUCO_KERNEL void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred)
-{
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-
-  while (idx < n) {
-    if (pred(*(s + idx))) {
-      typename viewT::value_type const insert_pair{*(first + idx)};
-      // force conversion to value_type
-      view.insert(tile, insert_pair);
-    }
-    idx += loop_stride;
-  }
-}
-
-/**
- * @brief Indicates whether the elements in the range `[first, last)` are contained in the map.
- *
- * Stores `true` or `false` to `(output + i)` indicating if the element `*(first + i)` exists in the
- * map.
- *
- * Uses the CUDA Cooperative Groups API to leverage groups of multiple threads to perform the
- * contains operation for each element. This provides a significant boost in throughput compared
- * to the non Cooperative Group `contains` at moderate to high load factors.
- *
- * @tparam is_pair_contains `true` if it's a `pair_contains` implementation
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups
- * @tparam InputIt Device accessible input iterator
- * @tparam OutputIt Device accessible output iterator assignable from `bool`
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam Equal Binary callable type
- *
- * @param first Beginning of the sequence of elements
- * @param n Number of elements to query
- * @param output_begin Beginning of the sequence of booleans for the presence of each element
- * @param view Device view used to access the hash map's slot storage
- * @param equal The binary function to compare input element and slot content for equality
- */
-template <bool is_pair_contains,
-          uint32_t block_size,
-          uint32_t tile_size,
-          typename InputIt,
-          typename OutputIt,
-          typename viewT,
-          typename Equal>
-CUCO_KERNEL void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal)
-{
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-  __shared__ bool writeBuffer[block_size / tile_size];
-
-  while (idx < n) {
-    typename cuda::std::iterator_traits<InputIt>::value_type element = *(first + idx);
-    auto found                                                       = [&]() {
-      if constexpr (is_pair_contains) { return view.pair_contains(tile, element, equal); }
-      if constexpr (not is_pair_contains) { return view.contains(tile, element, equal); }
-    }();
-
-    /*
-     * The ld.relaxed.gpu instruction used in view.find causes L1 to
-     * flush more frequently, causing increased sector stores from L2 to global memory.
-     * By writing results to shared memory and then synchronizing before writing back
-     * to global, we no longer rely on L1, preventing the increase in sector stores from
-     * L2 to global and improving performance.
-     */
-    if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; }
-    __syncthreads();
-    if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; }
-    idx += loop_stride;
-  }
-}
-
-/**
- * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap.
- *
- * For each key, `k = *(first + i)`, counts all matching keys, `k'`, as determined by `key_equal(k,
- * k')` and stores the sum of all matches for all keys to `num_matches`. If `k` does not have any
- * matches, it contributes 1 to the final sum only if `is_outer` is true.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform counts
- * @tparam uses_vector_load Boolean flag indicating whether vector loads are used or not
- * @tparam is_outer Boolean flag indicating whether non-matches are counted
- * @tparam InputIt Device accessible input iterator whose `value_type` is convertible to the map's
- * `key_type`
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam KeyEqual Binary callable
- *
- * @param first Beginning of the sequence of keys to count
- * @param n Number of the keys to query
- * @param num_matches The number of all the matches for a sequence of keys
- * @param view Device view used to access the hash map's slot storage
- * @param key_equal Binary function to compare two keys for equality
- */
-template <uint32_t block_size,
-          uint32_t tile_size,
-          bool is_outer,
-          typename InputIt,
-          typename atomicT,
-          typename viewT,
-          typename KeyEqual>
-CUCO_KERNEL void count(
-  InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal)
-{
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_matches = 0;
-
-  while (idx < n) {
-    auto key = *(first + idx);
-    if constexpr (is_outer) {
-      thread_num_matches += view.count_outer(tile, key, key_equal);
-    } else {
-      thread_num_matches += view.count(tile, key, key_equal);
-    }
-    idx += loop_stride;
-  }
-
-  // compute number of successfully inserted elements for each block
-  // and atomically add to the grand total
-  std::size_t block_num_matches = BlockReduce(temp_storage).Sum(thread_num_matches);
-  if (threadIdx.x == 0) {
-    num_matches->fetch_add(block_num_matches, cuda::std::memory_order_relaxed);
-  }
-}
-
-/**
- * @brief Counts the occurrences of key/value pairs in `[first, last)` contained in the multimap.
- *
- * For pair, `p = *(first + i)`, counts all matching pairs, `p'`, as determined by `pair_equal(p,
- * p')` and stores the sum of all matches for all pairs to `num_matches`. If `p` does not have any
- * matches, it contributes 1 to the final sum only if `is_outer` is true.
- *
- * @tparam block_size The size of the thread block
- * @tparam tile_size The number of threads in the Cooperative Groups used to perform counts
- * @tparam is_outer Boolean flag indicating whether non-matches are counted
- * @tparam InputIt Device accessible random access input iterator where
- * `std::is_convertible<std::iterator_traits<InputIt>::value_type,
- * static_multimap<K, V>::value_type>` is `true`
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam PairEqual Binary callable
- *
- * @param first Beginning of the sequence of pairs to count
- * @param n Number of the pairs to query
- * @param num_matches The number of all the matches for a sequence of pairs
- * @param view Device view used to access the hash map's slot storage
- * @param pair_equal Binary function to compare two pairs for equality
- */
-template <uint32_t block_size,
-          uint32_t tile_size,
-          bool is_outer,
-          typename InputIt,
-          typename atomicT,
-          typename viewT,
-          typename PairEqual>
-CUCO_KERNEL void pair_count(
-  InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal)
-{
-  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / tile_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
-
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t thread_num_matches = 0;
-
-  while (idx < n) {
-    typename viewT::value_type const pair = *(first + idx);
-    if constexpr (is_outer) {
-      thread_num_matches += view.pair_count_outer(tile, pair, pair_equal);
-    } else {
-      thread_num_matches += view.pair_count(tile, pair, pair_equal);
-    }
-    idx += loop_stride;
-  }
-
-  // compute number of successfully inserted elements for each block
-  // and atomically add to the grand total
-  std::size_t block_num_matches = BlockReduce(temp_storage).Sum(thread_num_matches);
-  if (threadIdx.x == 0) {
-    num_matches->fetch_add(block_num_matches, cuda::std::memory_order_relaxed);
-  }
-}
-
-/**
- * @brief Retrieves all the values corresponding to all keys in the range `[first, last)`.
- *
- * For key `k = *(first + i)` existing in the map, copies `k` and all associated values to
- * unspecified locations in `[output_begin, output_end)`. If `k` does not have any matches, copies
- * `k` and `empty_value_sentinel()` into the output only if `is_outer` is true.
- *
- * Behavior is undefined if the total number of matching keys exceeds `std::distance(output_begin,
- * output_begin + *num_matches - 1)`. Use `count()` to determine the size of the output range.
- *
- * @tparam block_size The size of the thread block
- * @tparam flushing_cg_size The size of the CG used to flush output buffers
- * @tparam probing_cg_size The size of the CG for parallel retrievals
- * @tparam buffer_size Size of the output buffer
- * @tparam is_outer Boolean flag indicating whether non-matches are included in the output
- * @tparam InputIt Device accessible input iterator whose `value_type` is
- * convertible to the map's `key_type`
- * @tparam OutputIt Device accessible output iterator whose `value_type` is
- * constructible from the map's `value_type`
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam KeyEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param n Number of the keys to query
- * @param output_begin Beginning of the sequence of values retrieved for each key
- * @param num_matches Size of the output sequence
- * @param view Device view used to access the hash map's slot storage
- * @param key_equal The binary function to compare two keys for equality
- */
-template <uint32_t block_size,
-          uint32_t flushing_cg_size,
-          uint32_t probing_cg_size,
-          uint32_t buffer_size,
-          bool is_outer,
-          typename InputIt,
-          typename OutputIt,
-          typename atomicT,
-          typename viewT,
-          typename KeyEqual>
-CUCO_KERNEL void retrieve(InputIt first,
-                          int64_t n,
-                          OutputIt output_begin,
-                          atomicT* num_matches,
-                          viewT view,
-                          KeyEqual key_equal)
-{
-  using pair_type = typename viewT::value_type;
-
-  constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size;
-  const uint32_t flushing_cg_id       = threadIdx.x / flushing_cg_size;
-
-  auto flushing_cg =
-    cg::tiled_partition<flushing_cg_size, cg::thread_block>(cg::this_thread_block());
-  auto probing_cg = cg::tiled_partition<probing_cg_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / probing_cg_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size;
-
-  __shared__ pair_type output_buffer[num_flushing_cgs][buffer_size];
-  // TODO: replace this with shared memory cuda::atomic variables once the dynamiic initialization
-  // warning issue is solved __shared__ atomicT counter[num_flushing_cgs][buffer_size];
-  __shared__ uint32_t flushing_cg_counter[num_flushing_cgs];
-
-  if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; }
-
-  flushing_cg.sync();
-
-  while (flushing_cg.any(idx < n)) {
-    bool active_flag        = idx < n;
-    auto active_flushing_cg = cg::binary_partition<flushing_cg_size>(flushing_cg, active_flag);
-
-    if (active_flag) {
-      auto key = *(first + idx);
-      if constexpr (is_outer) {
-        view.template retrieve_outer<buffer_size>(active_flushing_cg,
-                                                  probing_cg,
-                                                  key,
-                                                  &flushing_cg_counter[flushing_cg_id],
-                                                  output_buffer[flushing_cg_id],
-                                                  num_matches,
-                                                  output_begin,
-                                                  key_equal);
-      } else {
-        view.template retrieve<buffer_size>(active_flushing_cg,
-                                            probing_cg,
-                                            key,
-                                            &flushing_cg_counter[flushing_cg_id],
-                                            output_buffer[flushing_cg_id],
-                                            num_matches,
-                                            output_begin,
-                                            key_equal);
-      }
-    }
-    idx += loop_stride;
-  }
-
-  flushing_cg.sync();
-  // Final flush of output buffer
-  if (flushing_cg_counter[flushing_cg_id] > 0) {
-    view.flush_output_buffer(flushing_cg,
-                             flushing_cg_counter[flushing_cg_id],
-                             output_buffer[flushing_cg_id],
-                             num_matches,
-                             output_begin);
-  }
-}
-
-/**
- * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`.
- *
- * If pair_equal(*(first + i), slot[j]) returns true, then *(first+i) is stored to unspecified
- * locations in `probe_output_begin`, and slot[j] is stored to unspecified locations in
- * `contained_output_begin`. If the given pair has no matches in the map, copies *(first + i) in
- * `probe_output_begin` and a pair of `empty_key_sentinel` and `empty_value_sentinel` in
- * `contained_output_begin` only when `is_outer` is `true`.
- *
- * Behavior is undefined if the total number of matching pairs exceeds `std::distance(output_begin,
- * output_begin + *num_matches - 1)`. Use `pair_count()` to determine the size of the output range.
- *
- * @tparam block_size The size of the thread block
- * @tparam flushing_cg_size The size of the CG used to flush output buffers
- * @tparam probing_cg_size The size of the CG for parallel retrievals
- * @tparam buffer_size Size of the output buffer
- * @tparam is_outer Boolean flag indicating whether non-matches are included in the output
- * @tparam InputIt Device accessible random access input iterator where
- * `std::is_convertible<std::iterator_traits<InputIt>::value_type,
- * static_multimap<K, V>::value_type>` is `true`
- * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
- * `InputIt`s `value_type`.
- * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
- * the map's `value_type`.
- * @tparam atomicT Type of atomic storage
- * @tparam viewT Type of device view allowing access of hash map storage
- * @tparam PairEqual Binary callable type
- *
- * @param first Beginning of the sequence of keys
- * @param n Number of keys to query
- * @param probe_output_begin Beginning of the sequence of the matched probe pairs
- * @param contained_output_begin Beginning of the sequence of the matched contained pairs
- * @param num_matches Size of the output sequence
- * @param view Device view used to access the hash map's slot storage
- * @param pair_equal The binary function to compare two pairs for equality
- */
-template <uint32_t block_size,
-          uint32_t flushing_cg_size,
-          uint32_t probing_cg_size,
-          uint32_t buffer_size,
-          bool is_outer,
-          typename InputIt,
-          typename OutputIt1,
-          typename OutputIt2,
-          typename atomicT,
-          typename viewT,
-          typename PairEqual>
-CUCO_KERNEL void pair_retrieve(InputIt first,
-                               int64_t n,
-                               OutputIt1 probe_output_begin,
-                               OutputIt2 contained_output_begin,
-                               atomicT* num_matches,
-                               viewT view,
-                               PairEqual pair_equal)
-{
-  using pair_type = typename viewT::value_type;
-
-  constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size;
-  const uint32_t flushing_cg_id       = threadIdx.x / flushing_cg_size;
-
-  auto flushing_cg =
-    cg::tiled_partition<flushing_cg_size, cg::thread_block>(cg::this_thread_block());
-  auto probing_cg = cg::tiled_partition<probing_cg_size, cg::thread_block>(cg::this_thread_block());
-  int64_t const loop_stride = gridDim.x * block_size / probing_cg_size;
-  int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size;
-
-  __shared__ pair_type probe_output_buffer[num_flushing_cgs][buffer_size];
-  __shared__ pair_type contained_output_buffer[num_flushing_cgs][buffer_size];
-  // TODO: replace this with shared memory cuda::atomic variables once the dynamiic initialization
-  // warning issue is solved __shared__ atomicT counter[num_flushing_cgs][buffer_size];
-  __shared__ uint32_t flushing_cg_counter[num_flushing_cgs];
-
-  if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; }
-
-  flushing_cg.sync();
-
-  while (flushing_cg.any(idx < n)) {
-    bool active_flag        = idx < n;
-    auto active_flushing_cg = cg::binary_partition<flushing_cg_size>(flushing_cg, active_flag);
-
-    if (active_flag) {
-      pair_type pair = *(first + idx);
-      if constexpr (is_outer) {
-        view.pair_retrieve_outer<buffer_size>(active_flushing_cg,
-                                              probing_cg,
-                                              pair,
-                                              &flushing_cg_counter[flushing_cg_id],
-                                              probe_output_buffer[flushing_cg_id],
-                                              contained_output_buffer[flushing_cg_id],
-                                              num_matches,
-                                              probe_output_begin,
-                                              contained_output_begin,
-                                              pair_equal);
-      } else {
-        view.pair_retrieve<buffer_size>(active_flushing_cg,
-                                        probing_cg,
-                                        pair,
-                                        &flushing_cg_counter[flushing_cg_id],
-                                        probe_output_buffer[flushing_cg_id],
-                                        contained_output_buffer[flushing_cg_id],
-                                        num_matches,
-                                        probe_output_begin,
-                                        contained_output_begin,
-                                        pair_equal);
-      }
-    }
-    idx += loop_stride;
-  }
-
-  flushing_cg.sync();
-  // Final flush of output buffer
-  if (flushing_cg_counter[flushing_cg_id] > 0) {
-    view.flush_output_buffer(flushing_cg,
-                             flushing_cg_counter[flushing_cg_id],
-                             probe_output_buffer[flushing_cg_id],
-                             contained_output_buffer[flushing_cg_id],
-                             num_matches,
-                             probe_output_begin,
-                             contained_output_begin);
-  }
-}
-}  // namespace detail
-}  // namespace cuco
diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl
index 39e23f671..f0b3d4330 100644
--- a/include/cuco/detail/static_multimap/static_multimap.inl
+++ b/include/cuco/detail/static_multimap/static_multimap.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
+#include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/storage/counter_storage.cuh>
 #include <cuco/detail/utility/cuda.hpp>
-#include <cuco/detail/utils.cuh>
 
 #include <cuda/std/tuple>
 #include <thrust/count.h>
@@ -26,7 +26,6 @@
 #include <iterator>
 
 namespace cuco {
-namespace experimental {
 template <class Key,
           class T,
           class Extent,
@@ -769,858 +768,5 @@ auto static_multimap<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator,
                                     cuda_thread_scope<Scope>{},
                                     impl_->storage_ref()};
 }
-}  // namespace experimental
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::static_multimap(
-  std::size_t capacity,
-  empty_key<Key> empty_key_sentinel,
-  empty_value<Value> empty_value_sentinel,
-  cudaStream_t stream,
-  Allocator const& alloc)
-  : capacity_{cuco::detail::get_valid_capacity<cg_size(), vector_width(), uses_vector_load()>(
-      capacity)},
-    empty_key_sentinel_{empty_key_sentinel.value},
-    empty_value_sentinel_{empty_value_sentinel.value},
-    allocator_{alloc},
-    delete_slots_{allocator_, capacity_, cuda::stream_ref{stream}},
-    slots_{allocator_.allocate(capacity_, cuda::stream_ref{stream}), delete_slots_}
-{
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 4;
-  auto const grid_size      = (get_capacity() + stride * block_size - 1) / (stride * block_size);
-
-  detail::initialize<atomic_key_type, atomic_mapped_type><<<grid_size, block_size, 0, stream>>>(
-    slots_.get(), empty_key_sentinel_, empty_value_sentinel_, get_capacity());
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt>
-void static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::insert(InputIt first,
-                                                                          InputIt last,
-                                                                          cudaStream_t stream)
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return; }
-
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-  auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
-  auto view            = get_device_mutable_view();
-
-  detail::insert<block_size, cg_size()>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, view);
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename StencilIt, typename Predicate>
-void static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::insert_if(
-  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream)
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return; }
-
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-  auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
-  auto view            = get_device_mutable_view();
-
-  detail::insert_if_n<block_size, cg_size()>
-    <<<grid_size, block_size, 0, stream>>>(first, stencil, num_keys, view, pred);
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename OutputIt, typename KeyEqual>
-void static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::contains(
-  InputIt first, InputIt last, OutputIt output_begin, KeyEqual key_equal, cudaStream_t stream) const
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return; }
-
-  auto constexpr is_pair_contains = false;
-  auto constexpr block_size       = 128;
-  auto constexpr stride           = 1;
-  auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
-  auto view            = get_device_view();
-
-  detail::contains<is_pair_contains, block_size, cg_size()>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, output_begin, view, key_equal);
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename OutputIt, typename PairEqual>
-void static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_contains(
-  InputIt first, InputIt last, OutputIt output_begin, PairEqual pair_equal, cudaStream_t stream)
-  const
-{
-  auto const num_pairs = cuco::detail::distance(first, last);
-  if (num_pairs == 0) { return; }
-
-  auto constexpr is_pair_contains = true;
-  auto constexpr block_size       = 128;
-  auto constexpr stride           = 1;
-  auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
-  auto view            = get_device_view();
-
-  detail::contains<is_pair_contains, block_size, cg_size()>
-    <<<grid_size, block_size, 0, stream>>>(first, num_pairs, output_begin, view, pair_equal);
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename KeyEqual>
-std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count(
-  InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return 0; }
-
-  auto constexpr is_outer   = false;
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-
-  auto view            = get_device_view();
-  auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
-
-  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
-  counter.reset(stream);
-
-  detail::count<block_size, cg_size(), is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, counter.data(), view, key_equal);
-
-  return counter.load_to_host(stream);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename KeyEqual>
-std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count_outer(
-  InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return 0; }
-
-  auto constexpr is_outer   = true;
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-
-  auto view            = get_device_view();
-  auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
-
-  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
-  counter.reset(stream);
-
-  detail::count<block_size, cg_size(), is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, counter.data(), view, key_equal);
-
-  return counter.load_to_host(stream);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename PairEqual>
-std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_count(
-  InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const
-{
-  auto const num_pairs = cuco::detail::distance(first, last);
-  if (num_pairs == 0) { return 0; }
-
-  auto constexpr is_outer   = false;
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-
-  auto view            = get_device_view();
-  auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
-
-  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
-  counter.reset(stream);
-
-  detail::pair_count<block_size, cg_size(), is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first, num_pairs, counter.data(), view, pair_equal);
-
-  return counter.load_to_host(stream);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename PairEqual>
-std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_count_outer(
-  InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const
-{
-  auto const num_pairs = cuco::detail::distance(first, last);
-  if (num_pairs == 0) { return 0; }
-
-  auto constexpr is_outer   = true;
-  auto constexpr block_size = 128;
-  auto constexpr stride     = 1;
-
-  auto view            = get_device_view();
-  auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
-
-  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
-  counter.reset(stream);
-
-  detail::pair_count<block_size, cg_size(), is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first, num_pairs, counter.data(), view, pair_equal);
-
-  return counter.load_to_host(stream);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename OutputIt, typename KeyEqual>
-OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve(
-  InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return output_begin; }
-
-  // Using per-warp buffer for vector loads and per-CG buffer for scalar loads
-  constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u);
-  constexpr auto is_outer    = false;
-
-  auto view                   = get_device_view();
-  auto const flushing_cg_size = [&]() {
-    if constexpr (uses_vector_load()) { return warp_size(); }
-    return cg_size();
-  }();
-
-  auto const grid_size = detail::grid_size(num_keys, cg_size());
-
-  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
-  counter.reset(stream);
-
-  detail::retrieve<detail::default_block_size(), flushing_cg_size, cg_size(), buffer_size, is_outer>
-    <<<grid_size, detail::default_block_size(), 0, stream>>>(
-      first, num_keys, output_begin, counter.data(), view, key_equal);
-
-  return output_begin + counter.load_to_host(stream);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename OutputIt, typename KeyEqual>
-OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve_outer(
-  InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const
-{
-  auto const num_keys = cuco::detail::distance(first, last);
-  if (num_keys == 0) { return output_begin; }
-
-  // Using per-warp buffer for vector loads and per-CG buffer for scalar loads
-  constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u);
-  constexpr auto is_outer    = true;
-
-  auto view                   = get_device_view();
-  auto const flushing_cg_size = [&]() {
-    if constexpr (uses_vector_load()) { return warp_size(); }
-    return cg_size();
-  }();
-
-  auto const grid_size = detail::grid_size(num_keys, cg_size());
-
-  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
-  counter.reset(stream);
-
-  detail::retrieve<detail::default_block_size(), flushing_cg_size, cg_size(), buffer_size, is_outer>
-    <<<grid_size, detail::default_block_size(), 0, stream>>>(
-      first, num_keys, output_begin, counter.data(), view, key_equal);
-
-  return output_begin + counter.load_to_host(stream);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename OutputIt1, typename OutputIt2, typename PairEqual>
-std::pair<OutputIt1, OutputIt2>
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve(
-  InputIt first,
-  InputIt last,
-  OutputIt1 probe_output_begin,
-  OutputIt2 contained_output_begin,
-  PairEqual pair_equal,
-  cudaStream_t stream) const
-{
-  auto const num_pairs = cuco::detail::distance(first, last);
-  if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); }
-
-  // Using per-warp buffer for vector loads and per-CG buffer for scalar loads
-  constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u);
-  constexpr auto block_size  = 128;
-  constexpr auto is_outer    = false;
-  constexpr auto stride      = 1;
-
-  auto view                   = get_device_view();
-  auto const flushing_cg_size = [&]() {
-    if constexpr (uses_vector_load()) { return warp_size(); }
-    return cg_size();
-  }();
-  auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
-
-  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
-  counter.reset(stream);
-
-  detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first,
-                                           num_pairs,
-                                           probe_output_begin,
-                                           contained_output_begin,
-                                           counter.data(),
-                                           view,
-                                           pair_equal);
-
-  auto const h_count = counter.load_to_host(stream);
-  return {probe_output_begin + h_count, contained_output_begin + h_count};
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename InputIt, typename OutputIt1, typename OutputIt2, typename PairEqual>
-std::pair<OutputIt1, OutputIt2>
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve_outer(
-  InputIt first,
-  InputIt last,
-  OutputIt1 probe_output_begin,
-  OutputIt2 contained_output_begin,
-  PairEqual pair_equal,
-  cudaStream_t stream) const
-{
-  auto const num_pairs = cuco::detail::distance(first, last);
-  if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); }
-
-  // Using per-warp buffer for vector loads and per-CG buffer for scalar loads
-  constexpr auto buffer_size = uses_vector_load() ? (warp_size() * 3u) : (cg_size() * 3u);
-  constexpr auto block_size  = 128;
-  constexpr auto is_outer    = true;
-  constexpr auto stride      = 1;
-
-  auto view                   = get_device_view();
-  auto const flushing_cg_size = [&]() {
-    if constexpr (uses_vector_load()) { return warp_size(); }
-    return cg_size();
-  }();
-  auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
-
-  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_, stream};
-  counter.reset(stream);
-
-  detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first,
-                                           num_pairs,
-                                           probe_output_begin,
-                                           contained_output_begin,
-                                           counter.data(),
-                                           view,
-                                           pair_equal);
-
-  auto const h_count = counter.load_to_host(stream);
-  return {probe_output_begin + h_count, contained_output_begin + h_count};
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename ParentCG>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_mutable_view::insert(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-  value_type const& insert_pair) noexcept
-{
-  impl_.template insert<uses_vector_load()>(g, insert_pair);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename CG>
-__device__ __forceinline__ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::make_copy(
-  CG g, pair_atomic_type* const memory_to_use, device_view source_device_view) noexcept
-{
-#if defined(CUCO_HAS_CUDA_BARRIER)
-  __shared__ cuda::barrier<cuda::thread_scope::thread_scope_block> barrier;
-  if (g.thread_rank() == 0) { init(&barrier, g.size()); }
-  g.sync();
-
-  cuda::memcpy_async(g,
-                     memory_to_use,
-                     source_device_view.get_slots(),
-                     sizeof(pair_atomic_type) * source_device_view.get_capacity(),
-                     barrier);
-
-  barrier.arrive_and_wait();
-#else
-  pair_atomic_type const* const slots_ptr = source_device_view.get_slots();
-  for (std::size_t i = g.thread_rank(); i < source_device_view.get_capacity(); i += g.size()) {
-    new (&memory_to_use[i].first)
-      atomic_key_type{slots_ptr[i].first.load(cuda::memory_order_relaxed)};
-    new (&memory_to_use[i].second)
-      atomic_mapped_type{slots_ptr[i].second.load(cuda::memory_order_relaxed)};
-  }
-  g.sync();
-#endif
-
-  return device_view(memory_to_use,
-                     source_device_view.get_capacity(),
-                     source_device_view.get_empty_key_sentinel(),
-                     source_device_view.get_empty_value_sentinel());
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename CG, typename atomicT, typename OutputIt>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::flush_output_buffer(
-  CG g,
-  uint32_t const num_outputs,
-  value_type* output_buffer,
-  atomicT* num_matches,
-  OutputIt output_begin) noexcept
-{
-  impl_.flush_output_buffer(g, num_outputs, output_buffer, num_matches, output_begin);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename CG, typename atomicT, typename OutputIt1, typename OutputIt2>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::flush_output_buffer(
-  CG g,
-  uint32_t const num_outputs,
-  value_type* probe_output_buffer,
-  value_type* contained_output_buffer,
-  atomicT* num_matches,
-  OutputIt1 probe_output_begin,
-  OutputIt2 contained_output_begin) noexcept
-{
-  impl_.flush_output_buffer(g,
-                            num_outputs,
-                            probe_output_buffer,
-                            contained_output_buffer,
-                            num_matches,
-                            probe_output_begin,
-                            contained_output_begin);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename ProbeKey, typename KeyEqual, typename ParentCG>
-__device__ __forceinline__ bool
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::contains(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-  ProbeKey const& k,
-  KeyEqual key_equal) const noexcept
-{
-  constexpr bool is_pair_contains = false;
-  return impl_.template contains<is_pair_contains, uses_vector_load()>(g, k, key_equal);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename ProbePair, typename PairEqual, typename ParentCG>
-__device__ __forceinline__ bool
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_contains(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-  ProbePair const& p,
-  PairEqual pair_equal) const noexcept
-{
-  constexpr bool is_pair_contains = true;
-  return impl_.template contains<is_pair_contains, uses_vector_load()>(g, p, pair_equal);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename KeyEqual, typename ParentCG>
-__device__ __forceinline__ std::size_t
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::count(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-  Key const& k,
-  KeyEqual key_equal) noexcept
-{
-  constexpr bool is_outer = false;
-  return impl_.count<uses_vector_load(), is_outer>(g, k, key_equal);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename KeyEqual, typename ParentCG>
-__device__ __forceinline__ std::size_t
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::count_outer(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-  Key const& k,
-  KeyEqual key_equal) noexcept
-{
-  constexpr bool is_outer = true;
-  return impl_.count<uses_vector_load(), is_outer>(g, k, key_equal);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename PairEqual, typename ParentCG>
-__device__ __forceinline__ std::size_t
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_count(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-  value_type const& pair,
-  PairEqual pair_equal) noexcept
-{
-  constexpr bool is_outer = false;
-  return impl_.pair_count<uses_vector_load(), is_outer>(g, pair, pair_equal);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename PairEqual, typename ParentCG>
-__device__ __forceinline__ std::size_t
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_count_outer(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-  value_type const& pair,
-  PairEqual pair_equal) noexcept
-{
-  constexpr bool is_outer = true;
-  return impl_.pair_count<uses_vector_load(), is_outer>(g, pair, pair_equal);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <uint32_t buffer_size,
-          typename FlushingCG,
-          typename atomicT,
-          typename OutputIt,
-          typename KeyEqual,
-          typename ParentCG>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::retrieve(
-  FlushingCG flushing_cg,
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-  Key const& k,
-  uint32_t* flushing_cg_counter,
-  value_type* output_buffer,
-  atomicT* num_matches,
-  OutputIt output_begin,
-  KeyEqual key_equal) noexcept
-{
-  constexpr bool is_outer = false;
-  if constexpr (uses_vector_load()) {
-    impl_.template retrieve<buffer_size, is_outer>(flushing_cg,
-                                                   probing_cg,
-                                                   k,
-                                                   flushing_cg_counter,
-                                                   output_buffer,
-                                                   num_matches,
-                                                   output_begin,
-                                                   key_equal);
-  } else  // In the case of scalar load, flushing CG is the same as probing CG
-  {
-    impl_.template retrieve<buffer_size, is_outer>(
-      probing_cg, k, flushing_cg_counter, output_buffer, num_matches, output_begin, key_equal);
-  }
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <uint32_t buffer_size,
-          typename FlushingCG,
-          typename atomicT,
-          typename OutputIt,
-          typename KeyEqual,
-          typename ParentCG>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::retrieve_outer(
-  FlushingCG flushing_cg,
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-  Key const& k,
-  uint32_t* flushing_cg_counter,
-  value_type* output_buffer,
-  atomicT* num_matches,
-  OutputIt output_begin,
-  KeyEqual key_equal) noexcept
-{
-  constexpr bool is_outer = true;
-  if constexpr (uses_vector_load()) {
-    impl_.template retrieve<buffer_size, is_outer>(flushing_cg,
-                                                   probing_cg,
-                                                   k,
-                                                   flushing_cg_counter,
-                                                   output_buffer,
-                                                   num_matches,
-                                                   output_begin,
-                                                   key_equal);
-  } else  // In the case of scalar load, flushing CG is the same as probing CG
-  {
-    impl_.template retrieve<buffer_size, is_outer>(
-      probing_cg, k, flushing_cg_counter, output_buffer, num_matches, output_begin, key_equal);
-  }
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename OutputIt1,
-          typename OutputIt2,
-          typename OutputIt3,
-          typename OutputIt4,
-          typename PairEqual,
-          typename ParentCG>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_retrieve(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-  value_type const& pair,
-  OutputIt1 probe_key_begin,
-  OutputIt2 probe_val_begin,
-  OutputIt3 contained_key_begin,
-  OutputIt4 contained_val_begin,
-  PairEqual pair_equal) noexcept
-{
-  constexpr bool is_outer = false;
-  impl_.pair_retrieve<is_outer, uses_vector_load()>(probing_cg,
-                                                    pair,
-                                                    probe_key_begin,
-                                                    probe_val_begin,
-                                                    contained_key_begin,
-                                                    contained_val_begin,
-                                                    pair_equal);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <uint32_t buffer_size,
-          typename FlushingCG,
-          typename atomicT,
-          typename OutputIt1,
-          typename OutputIt2,
-          typename PairEqual,
-          typename ParentCG>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_retrieve(
-  FlushingCG flushing_cg,
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-  value_type const& pair,
-  uint32_t* flushing_cg_counter,
-  value_type* probe_output_buffer,
-  value_type* contained_output_buffer,
-  atomicT* num_matches,
-  OutputIt1 probe_output_begin,
-  OutputIt2 contained_output_begin,
-  PairEqual pair_equal) noexcept
-{
-  constexpr bool is_outer = false;
-  if constexpr (uses_vector_load()) {
-    impl_.pair_retrieve<buffer_size, is_outer>(flushing_cg,
-                                               probing_cg,
-                                               pair,
-                                               flushing_cg_counter,
-                                               probe_output_buffer,
-                                               contained_output_buffer,
-                                               num_matches,
-                                               probe_output_begin,
-                                               contained_output_begin,
-                                               pair_equal);
-  } else  // In the case of scalar load, flushing CG is the same as probing CG
-  {
-    impl_.pair_retrieve<buffer_size, is_outer>(probing_cg,
-                                               pair,
-                                               flushing_cg_counter,
-                                               probe_output_buffer,
-                                               contained_output_buffer,
-                                               num_matches,
-                                               probe_output_begin,
-                                               contained_output_begin,
-                                               pair_equal);
-  }
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <typename OutputIt1,
-          typename OutputIt2,
-          typename OutputIt3,
-          typename OutputIt4,
-          typename PairEqual,
-          typename ParentCG>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_retrieve_outer(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-  value_type const& pair,
-  OutputIt1 probe_key_begin,
-  OutputIt2 probe_val_begin,
-  OutputIt3 contained_key_begin,
-  OutputIt4 contained_val_begin,
-  PairEqual pair_equal) noexcept
-{
-  constexpr bool is_outer = true;
-  impl_.pair_retrieve<is_outer, uses_vector_load()>(probing_cg,
-                                                    pair,
-                                                    probe_key_begin,
-                                                    probe_val_begin,
-                                                    contained_key_begin,
-                                                    contained_val_begin,
-                                                    pair_equal);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-template <uint32_t buffer_size,
-          typename FlushingCG,
-          typename atomicT,
-          typename OutputIt1,
-          typename OutputIt2,
-          typename PairEqual,
-          typename ParentCG>
-__device__ __forceinline__ void
-static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_retrieve_outer(
-  FlushingCG flushing_cg,
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-  value_type const& pair,
-  uint32_t* flushing_cg_counter,
-  value_type* probe_output_buffer,
-  value_type* contained_output_buffer,
-  atomicT* num_matches,
-  OutputIt1 probe_output_begin,
-  OutputIt2 contained_output_begin,
-  PairEqual pair_equal) noexcept
-{
-  constexpr bool is_outer = true;
-  if constexpr (uses_vector_load()) {
-    impl_.pair_retrieve<buffer_size, is_outer>(flushing_cg,
-                                               probing_cg,
-                                               pair,
-                                               flushing_cg_counter,
-                                               probe_output_buffer,
-                                               contained_output_buffer,
-                                               num_matches,
-                                               probe_output_begin,
-                                               contained_output_begin,
-                                               pair_equal);
-  } else  // In the case of scalar load, flushing CG is the same as probing CG
-  {
-    impl_.pair_retrieve<buffer_size, is_outer>(probing_cg,
-                                               pair,
-                                               flushing_cg_counter,
-                                               probe_output_buffer,
-                                               contained_output_buffer,
-                                               num_matches,
-                                               probe_output_begin,
-                                               contained_output_begin,
-                                               pair_equal);
-  }
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::get_size(
-  cudaStream_t stream) const noexcept
-{
-  auto begin  = thrust::make_transform_iterator(raw_slots(), detail::slot_to_tuple<Key, Value>{});
-  auto filled = cuco::detail::slot_is_filled<Key>{get_empty_key_sentinel()};
-
-  return thrust::count_if(thrust::cuda::par.on(stream), begin, begin + get_capacity(), filled);
-}
-
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope,
-          typename Allocator,
-          class ProbeSequence>
-float static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::get_load_factor(
-  cudaStream_t stream) const noexcept
-{
-  auto size = get_size(stream);
-  return static_cast<float>(size) / static_cast<float>(capacity_);
-}
 
 }  // namespace cuco
diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh
index 6bda77367..f11f9bed9 100644
--- a/include/cuco/detail/utils.cuh
+++ b/include/cuco/detail/utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,10 +19,8 @@
 #include <cuco/detail/bitwise_compare.cuh>
 
 #include <cuda/std/array>
-#include <cuda/std/bit>
 #include <cuda/std/cmath>
 #include <cuda/std/cstdint>
-#include <cuda/std/limits>
 #include <cuda/std/tuple>
 #include <cuda/std/type_traits>
 
diff --git a/include/cuco/probe_sequences.cuh b/include/cuco/probe_sequences.cuh
deleted file mode 100644
index 89840051f..000000000
--- a/include/cuco/probe_sequences.cuh
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cuco/detail/probe_sequence_impl.cuh>
-
-namespace cuco::legacy {
-
-/**
- * @brief Public linear probing scheme class.
- *
- * Linear probing is efficient when few collisions are present. Performance hints:
- * - Use linear probing when collisions are rare. e.g. low occupancy or low multiplicity.
- * - `CGSize` = 1 or 2 when hash map is small (10'000'000 or less), 4 or 8 otherwise.
- *
- * `Hash` should be callable object type.
- *
- * @tparam CGSize Size of CUDA Cooperative Groups
- * @tparam Hash Unary callable type
- */
-template <int CGSize, typename Hash>
-class linear_probing : public detail::probe_sequence_base<CGSize> {
- public:
-  using probe_sequence_base_type =
-    detail::probe_sequence_base<CGSize>;  ///< The base probe scheme type
-  using probe_sequence_base_type::cg_size;
-  using probe_sequence_base_type::vector_width;
-
-  /// Type of implementation details
-  template <typename Key, typename Value, cuda::thread_scope Scope>
-  using impl = detail::linear_probing_impl<Key, Value, Scope, vector_width(), CGSize, Hash>;
-};
-
-/**
- *
- * @brief Public double hashing scheme class.
- *
- * Default probe sequence for `cuco::static_multimap`. Double hashing shows superior
- * performance when dealing with high multiplicty and/or high occupancy use cases. Performance
- * hints:
- * - `CGSize` = 1 or 2 when hash map is small (10'000'000 or less), 4 or 8 otherwise.
- *
- * `Hash1` and `Hash2` should be callable object type.
- *
- * @tparam CGSize Size of CUDA Cooperative Groups
- * @tparam Hash1 Unary callable type
- * @tparam Hash2 Unary callable type
- */
-template <int CGSize, typename Hash1, typename Hash2 = Hash1>
-class double_hashing : public detail::probe_sequence_base<CGSize> {
- public:
-  using probe_sequence_base_type =
-    detail::probe_sequence_base<CGSize>;  ///< The base probe scheme type
-  using probe_sequence_base_type::cg_size;
-  using probe_sequence_base_type::vector_width;
-
-  /// Type of implementation details
-  template <typename Key, typename Value, cuda::thread_scope Scope>
-  using impl = detail::double_hashing_impl<Key, Value, Scope, vector_width(), CGSize, Hash1, Hash2>;
-};
-
-}  // namespace cuco::legacy
diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh
index 0fdefdf1c..449e6428a 100644
--- a/include/cuco/static_multimap.cuh
+++ b/include/cuco/static_multimap.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,9 +18,7 @@
 
 #include <cuco/detail/__config>
 #include <cuco/detail/open_addressing/open_addressing_impl.cuh>
-#include <cuco/detail/prime.hpp>
 #include <cuco/hash_functions.cuh>
-#include <cuco/probe_sequences.cuh>
 #include <cuco/static_multimap_ref.cuh>
 #include <cuco/types.cuh>
 #include <cuco/utility/allocator.hpp>
@@ -34,15 +32,10 @@
 #include <cuda/barrier>
 #endif
 
-#include <cooperative_groups.h>
-
 #include <cstddef>
 #include <memory>
-#include <type_traits>
-#include <utility>
 
 namespace cuco {
-namespace experimental {
 /**
  * @brief A GPU-accelerated, unordered, associative container of key-value pairs that supports
  * equivalent keys.
@@ -944,1337 +937,7 @@ class static_multimap {
   std::unique_ptr<impl_type> impl_;   ///< Static map implementation
   mapped_type empty_value_sentinel_;  ///< Sentinel value that indicates an empty payload
 };
-}  // namespace experimental
-
-/**
- * @brief A GPU-accelerated, unordered, associative container of key-value pairs that supports
- * equivalent keys.
- *
- * Allows constant time concurrent inserts or concurrent find operations from threads in device
- * code. Concurrent insert/find is allowed only when
- * <tt>static_multimap<Key, Value>::supports_concurrent_insert_find()</tt> is true.
- *
- * Current limitations:
- * - Requires keys and values where `cuco::is_bitwise_comparable_v<T>` is true
- * - Comparisons against the "sentinel" values will always be done with bitwise comparisons
- * Therefore, the objects must have unique, bitwise object representations (e.g., no padding bits).
- * - Does not support erasing keys
- * - Capacity is fixed and will not grow automatically
- * - Requires the user to specify sentinel values for both key and mapped value
- * to indicate empty slots
- * - Concurrent insert/find is only supported when
- * <tt>static_multimap<Key, Value>::supports_concurrent_insert_find()</tt> is true`
- *
- * The `static_multimap` supports two types of operations:
- * - Host-side "bulk" operations
- * - Device-side "singular" operations
- *
- * The host-side bulk operations include `insert`, `contains`, `count`, `retrieve` and their
- * variants. These APIs should be used when there are a large number of keys to insert or lookup in
- * the map. For example, given a range of keys specified by device-accessible iterators, the bulk
- * `insert` function will insert all keys into the map.
- *
- * The singular device-side operations allow individual threads to perform
- * independent operations (e.g. `insert`, etc.) from device code. These
- * operations are accessed through non-owning, trivially copyable "view" types:
- * `device_view` and `device_mutable_view`. The `device_view` class is an
- * immutable view that allows only non-modifying operations such as `count` or
- * `contains`. The `device_mutable_view` class only allows `insert` operations.
- * The two types are separate to prevent erroneous concurrent insert/find
- * operations.
- *
- * By default, when querying for a Key `k` in operations like `count` or `retrieve`, if `k` is not
- * present in the map, it will not contribute to the output. Query APIs with the `_outer` suffix
- * will include non-matching keys in the output. See the relevant API documentation for more
- * information.
- *
- * Typical associative container query APIs like `retrieve` look up values by solely by key, e.g.,
- * `count` for a Key `k` will count all values whose associated key `k'` matches `k` as determined
- * by `key_equal(k, k')`. In some cases, one may want to consider both key _and_ value when
- * determining if a key-value pair should contribute to the output. `static_multimap` supports this
- * use case with APIs prefixed with `pair_`, e.g., `pair_count` is given a key-value pair
- * `{k,v}` and only counts key-value pairs, `{k', v'}`, in the map where `pair_equal({k,v}, {k',
- * v'})` is true. See the relevant API documentation for more information.
- *
- * Example:
- * \code{.cpp}
- * int empty_key_sentinel = -1;
- * int empty_value_sentinel = -1;
- *
- * // Constructs a multimap with 100,000 slots using -1 and -1 as the empty key/value
- * // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys,
- * // for an load factor of 50%.
- * static_multimap<int, int> m{100'000, empty_key_sentinel, empty_value_sentinel};
- *
- * // Create a sequence of pairs {{0,0}, {1,1}, ... {i,i}}
- * thrust::device_vector<cuco::pair<int,int>> pairs(50,000);
- * thrust::transform(thrust::make_counting_iterator(0),
- *                   thrust::make_counting_iterator(pairs.size()),
- *                   pairs.begin(),
- *                   []__device__(auto i){ return cuco::pair{i,i}; };
- *
- * // Inserts all pairs into the map
- * m.insert(pairs.begin(), pairs.end());
- *
- * // Get a `device_view` and passes it to a kernel where threads may perform
- * // `contains/count/retrieve` lookups
- * kernel<<<...>>>(m.get_device_view());
- * \endcode
- *
- *
- * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v<Key>`
- * @tparam Value Type of the mapped values. Requires `cuco::is_bitwise_comparable_v<Value>`
- * @tparam Scope The scope in which multimap operations will be performed by
- * individual threads
- * @tparam ProbeSequence Probe sequence chosen between `cuco::legacy::linear_probing`
- * and `cuco::legacy::double_hashing`. (see `probe_sequences.cuh`)
- * @tparam Allocator Type of allocator used for device storage
- */
-template <typename Key,
-          typename Value,
-          cuda::thread_scope Scope = cuda::thread_scope_device,
-          typename Allocator       = cuco::cuda_allocator<char>,
-          class ProbeSequence = cuco::legacy::double_hashing<8, cuco::default_hash_function<Key>>>
-class static_multimap {
-  static_assert(
-    cuco::is_bitwise_comparable_v<Key>,
-    "Key type must have unique object representations or have been explicitly declared as safe for "
-    "bitwise comparison via specialization of cuco::is_bitwise_comparable_v<Key>.");
-
-  static_assert(
-    cuco::is_bitwise_comparable_v<Value>,
-    "Value type must have unique object representations or have been explicitly declared as safe "
-    "for bitwise comparison via specialization of cuco::is_bitwise_comparable_v<Value>.");
-
-  static_assert(std::is_base_of_v<cuco::legacy::detail::probe_sequence_base<ProbeSequence::cg_size>,
-                                  ProbeSequence>,
-                "ProbeSequence must be a specialization of either cuco::legacy::double_hashing or "
-                "cuco::legacy::linear_probing.");
-
- public:
-  using value_type         = cuco::pair<Key, Value>;            ///< Type of key/value pairs
-  using key_type           = Key;                               ///< Key type
-  using mapped_type        = Value;                             ///< Type of mapped values
-  using size_type          = std::size_t;                       ///< Size type
-  using atomic_key_type    = cuda::atomic<key_type, Scope>;     ///< Type of atomic keys
-  using atomic_mapped_type = cuda::atomic<mapped_type, Scope>;  ///< Type of atomic mapped values
-  using pair_atomic_type =
-    cuco::pair<atomic_key_type,
-               atomic_mapped_type>;  ///< Pair type of atomic key and atomic mapped value
-  using allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
-    pair_atomic_type>;  ///< Type of the allocator to (de)allocate slots
-  using probe_sequence_type =
-    cuco::legacy::detail::probe_sequence<ProbeSequence, Key, Value, Scope>;  ///< Probe scheme type
-
-  static_multimap(static_multimap const&)            = delete;
-  static_multimap& operator=(static_multimap const&) = delete;
-
-  static_multimap(static_multimap&&) = default;  ///< Move constructor
-
-  /**
-   * @brief Replaces the contents of the map with another map.
-   *
-   * @return Reference of the current map object
-   */
-  static_multimap& operator=(static_multimap&&) = default;
-  ~static_multimap()                            = default;
-
-  /**
-   * @brief Indicate if concurrent insert/find is supported for the key/value types.
-   *
-   * @return Boolean indicating if concurrent insert/find is supported.
-   */
-  __host__ __device__ __forceinline__ static constexpr bool
-  supports_concurrent_insert_find() noexcept
-  {
-    return cuco::detail::is_packable<value_type>();
-  }
-
-  /**
-   * @brief The size of the CUDA cooperative thread group.
-   *
-   * @return The CG size.
-   */
-  __host__ __device__ __forceinline__ static constexpr uint32_t cg_size() noexcept
-  {
-    return ProbeSequence::cg_size;
-  }
-
-  /**
-   * @brief Construct a statically-sized map with the specified initial capacity,
-   * sentinel values and CUDA stream.
-   *
-   * The capacity of the map is fixed. Insert operations will not automatically
-   * grow the map. Attempting to insert more unique keys than the capacity of
-   * the map results in undefined behavior.
-   *
-   * Performance begins to degrade significantly beyond a load factor of ~70%.
-   * For best performance, choose a capacity that will keep the load factor
-   * below 70%. E.g., if inserting `N` unique keys, choose a capacity of
-   * `N * (1/0.7)`.
-   *
-   * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and
-   * undefined behavior results from attempting to insert any key/value pair
-   * that contains either.
-   *
-   * @param capacity The total number of slots in the map
-   * @param empty_key_sentinel The reserved key value for empty slots
-   * @param empty_value_sentinel The reserved mapped value for empty slots
-   * @param stream CUDA stream used to initialize the map
-   * @param alloc Allocator used for allocating device storage
-   */
-  static_multimap(std::size_t capacity,
-                  empty_key<Key> empty_key_sentinel,
-                  empty_value<Value> empty_value_sentinel,
-                  cudaStream_t stream    = 0,
-                  Allocator const& alloc = Allocator{});
-
-  /**
-   * @brief Inserts all key/value pairs in the range `[first, last)`.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * static_multimap<K, V>::value_type></tt> is `true`
-   * @param first Beginning of the sequence of key/value pairs
-   * @param last End of the sequence of key/value pairs
-   * @param stream CUDA stream used for insert
-   */
-  template <typename InputIt>
-  void insert(InputIt first, InputIt last, cudaStream_t stream = 0);
-
-  /**
-   * @brief Inserts key/value pairs in the range `[first, first + n)` if `pred`
-   * of the corresponding stencil returns true.
-   *
-   * The key/value pair `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * static_multimap<K, V>::value_type></tt> is `true`
-   * @tparam StencilIt Device accessible random access iterator whose value_type is
-   * convertible to Predicate's argument type
-   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and
-   * argument type is convertible from <tt>std::iterator_traits<StencilIt>::value_type</tt>.
-   * @param first Beginning of the sequence of key/value pairs
-   * @param last End of the sequence of key/value pairs
-   * @param stencil Beginning of the stencil sequence
-   * @param pred Predicate to test on every element in the range `[stencil, stencil +
-   * std::distance(first, last))`
-   * @param stream CUDA stream used for insert
-   */
-  template <typename InputIt, typename StencilIt, typename Predicate>
-  void insert_if(
-    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream = 0);
-
-  /**
-   * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
-   *
-   * Stores `true` or `false` to `(output + i)` indicating if the key `*(first + i)` exists in the
-   * map.
-   *
-   * ProbeSequence hashers should be callable with both
-   * <tt>std::iterator_traits<InputIt>::value_type</tt> and Key type.
-   * <tt>std::invoke_result<KeyEqual, std::iterator_traits<InputIt>::value_type, Key></tt> must be
-   * well-formed.
-   *
-   * @tparam InputIt Device accessible input iterator
-   * @tparam OutputIt Device accessible output iterator assignable from `bool`
-   * @tparam KeyEqual Binary callable type used to compare two keys for equality
-   *
-   * @param first Beginning of the sequence of keys
-   * @param last End of the sequence of keys
-   * @param output_begin Beginning of the output sequence indicating whether each key is present
-   * @param key_equal The binary function to compare two keys for equality
-   * @param stream CUDA stream used for contains
-   */
-  template <typename InputIt, typename OutputIt, typename KeyEqual = cuda::std::equal_to<key_type>>
-  void contains(InputIt first,
-                InputIt last,
-                OutputIt output_begin,
-                KeyEqual key_equal  = KeyEqual{},
-                cudaStream_t stream = 0) const;
-
-  /**
-   * @brief Indicates whether the pairs in the range `[first, last)` are contained in the map.
-   *
-   * Stores `true` or `false` to `(output + i)` indicating if the pair `*(first + i)` exists in
-   * the map.
-   *
-   * ProbeSequence hashers should be callable with both
-   * <tt>std::iterator_traits<InputIt>::value_type::first_type</tt>
-   * and Key type. <tt>std::invoke_result<KeyEqual,
-   * std::iterator_traits<InputIt>::value_type::first_type, Key></tt>
-   * must be well-formed.
-   *
-   * @tparam InputIt Device accessible random access input iterator
-   * @tparam OutputIt Device accessible output iterator assignable from `bool`
-   * @tparam PairEqual Binary callable type used to compare input pair and slot content for equality
-   *
-   * @param first Beginning of the sequence of pairs
-   * @param last End of the sequence of pairs
-   * @param output_begin Beginning of the output sequence indicating whether each pair is present
-   * @param pair_equal The binary function to compare input pair and slot content for equality
-   * @param stream CUDA stream used for contains
-   */
-  template <typename InputIt, typename OutputIt, typename PairEqual>
-  void pair_contains(InputIt first,
-                     InputIt last,
-                     OutputIt output_begin,
-                     PairEqual pair_equal,
-                     cudaStream_t stream = 0) const;
-
-  /**
-   * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap.
-   *
-   * For each key, `k = *(first + i)`, counts all matching keys, `k'`, as determined by
-   * `key_equal(k, k')` and returns the sum of all matches for all keys.
-   *
-   * @tparam Input Device accessible input iterator whose `value_type` is convertible to `key_type`
-   * @tparam KeyEqual Binary callable
-   * @param first Beginning of the sequence of keys to count
-   * @param last End of the sequence of keys to count
-   * @param stream CUDA stream used for count
-   * @param key_equal Binary function to compare two keys for equality
-   * @return The sum of total occurrences of all keys in `[first, last)`
-   */
-  template <typename InputIt, typename KeyEqual = cuda::std::equal_to<key_type>>
-  std::size_t count(InputIt first,
-                    InputIt last,
-                    cudaStream_t stream = 0,
-                    KeyEqual key_equal  = KeyEqual{}) const;
-
-  /**
-   * @brief Counts the occurrences of keys in `[first, last)` contained in the multimap.
-   *
-   * For each key, `k = *(first + i)`, counts all matching keys, `k'`, as determined by
-   * `key_equal(k, k')` and returns the sum of all matches for all keys. If `k` does not have any
-   * matches, it contributes 1 to the final sum.
-   *
-   * @tparam Input Device accessible input iterator whose `value_type` is convertible to `key_type`
-   * @tparam KeyEqual Binary callable
-   * @param first Beginning of the sequence of keys to count
-   * @param last End of the sequence of keys to count
-   * @param stream CUDA stream used for count_outer
-   * @param key_equal Binary function to compare two keys for equality
-   * @return The sum of total occurrences of all keys in `[first, last)` where keys without matches
-   * are considered to have a single occurrence.
-   */
-  template <typename InputIt, typename KeyEqual = cuda::std::equal_to<key_type>>
-  std::size_t count_outer(InputIt first,
-                          InputIt last,
-                          cudaStream_t stream = 0,
-                          KeyEqual key_equal  = KeyEqual{}) const;
-
-  /**
-   * @brief Counts the occurrences of key/value pairs in `[first, last)` contained in the multimap.
-   *
-   * For key-value pair, `kv = *(first + i)`, counts all matching key-value pairs, `kv'`, as
-   * determined by `pair_equal(kv, kv')` and returns the sum of all matches for all key-value pairs.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * static_multimap<K, V>::value_type></tt> is `true`
-   * @tparam PairEqual Binary callable
-   * @param first Beginning of the sequence of pairs to count
-   * @param last End of the sequence of pairs to count
-   * @param pair_equal Binary function to compare two pairs for equality
-   * @param stream CUDA stream used for pair_count
-   * @return The sum of total occurrences of all pairs in `[first, last)`
-   */
-  template <typename InputIt, typename PairEqual>
-  std::size_t pair_count(InputIt first,
-                         InputIt last,
-                         PairEqual pair_equal,
-                         cudaStream_t stream = 0) const;
-
-  /**
-   * @brief Counts the occurrences of key/value pairs in `[first, last)` contained in the multimap.
-   *
-   * For key-value pair, `kv = *(first + i)`, counts all matching key-value pairs, `kv'`, as
-   * determined by `pair_equal(kv, kv')` and returns the sum of all matches for all key-value pairs.
-   * if `kv` does not have any matches, it contributes 1 to the final sum.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * static_multimap<K, V>::value_type></tt> is `true`
-   * @tparam PairEqual Binary callable
-   * @param first Beginning of the sequence of pairs to count
-   * @param last End of the sequence of pairs to count
-   * @param pair_equal Binary function to compare two pairs for equality
-   * @param stream CUDA stream used for pair_count_outer
-   * @return The sum of total occurrences of all pairs in `[first, last)` where a key-value pair
-   * without a match is considered to have a single occurrence
-   */
-  template <typename InputIt, typename PairEqual>
-  std::size_t pair_count_outer(InputIt first,
-                               InputIt last,
-                               PairEqual pair_equal,
-                               cudaStream_t stream = 0) const;
-
-  /**
-   * @brief Retrieves all the values corresponding to all keys in the range `[first, last)`.
-   *
-   * If key `k = *(first + i)` exists in the map, copies `k` and all associated values to
-   * unspecified locations in `[output_begin, output_end)`. Else, does nothing.
-   *
-   * Behavior is undefined if the size of the output range exceeds `std::distance(output_begin,
-   * output_end)`. Use `count()` to determine the size of the output range.
-   *
-   * @tparam InputIt Device accessible input iterator whose `value_type` is
-   * convertible to the map's `key_type`
-   * @tparam OutputIt Device accessible output iterator whose `value_type` is
-   * constructible from the map's `value_type`
-   * @tparam KeyEqual Binary callable type
-   * @param first Beginning of the sequence of keys
-   * @param last End of the sequence of keys
-   * @param output_begin Beginning of the sequence of key/value pairs retrieved for each key
-   * @param stream CUDA stream used for retrieve
-   * @param key_equal The binary function to compare two keys for equality
-   * @return The iterator indicating the last valid key/value pairs in the output
-   */
-  template <typename InputIt, typename OutputIt, typename KeyEqual = cuda::std::equal_to<key_type>>
-  OutputIt retrieve(InputIt first,
-                    InputIt last,
-                    OutputIt output_begin,
-                    cudaStream_t stream = 0,
-                    KeyEqual key_equal  = KeyEqual{}) const;
-
-  /**
-   * @brief Retrieves all the matches corresponding to all keys in the range `[first, last)`.
-   *
-   * If key `k = *(first + i)` exists in the map, copies `k` and all associated values to
-   * unspecified locations in `[output_begin, output_end)`. Else, copies `k` and
-   * `empty_value_sentinel`.
-   *
-   * Behavior is undefined if the size of the output range exceeds `std::distance(output_begin,
-   * output_end)`. Use `count_outer()` to determine the size of the output range.
-   *
-   * @tparam InputIt Device accessible input iterator whose `value_type` is
-   * convertible to the map's `key_type`
-   * @tparam OutputIt Device accessible output iterator whose `value_type` is
-   * constructible from the map's `value_type`
-   * @tparam KeyEqual Binary callable type
-   * @param first Beginning of the sequence of keys
-   * @param last End of the sequence of keys
-   * @param output_begin Beginning of the sequence of key/value pairs retrieved for each key
-   * @param stream CUDA stream used for retrieve_outer
-   * @param key_equal The binary function to compare two keys for equality
-   * @return The iterator indicating the last valid key/value pairs in the output
-   */
-  template <typename InputIt, typename OutputIt, typename KeyEqual = cuda::std::equal_to<key_type>>
-  OutputIt retrieve_outer(InputIt first,
-                          InputIt last,
-                          OutputIt output_begin,
-                          cudaStream_t stream = 0,
-                          KeyEqual key_equal  = KeyEqual{}) const;
-
-  /**
-   * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`.
-   *
-   * The `pair_` prefix indicates that the input data type is convertible to the map's
-   * `value_type`. If pair_equal(*(first + i), slot[j]) returns true, then *(first+i) is
-   * stored to `probe_output_begin`, and slot[j] is stored to `contained_output_begin`.
-   *
-   * Behavior is undefined if the size of the output range exceeds
-   * `std::distance(probe_output_begin, probe_output_end)` (or
-   * `std::distance(contained_output_begin, contained_output_end)`). Use
-   * `pair_count()` to determine the size of the output range.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * static_multimap<K, V>::value_type></tt> is `true`
-   * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-   * `InputIt`s `value_type`.
-   * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `value_type`.
-   * @tparam PairEqual Binary callable type
-   * @param first Beginning of the sequence of pairs
-   * @param last End of the sequence of pairs
-   * @param probe_output_begin Beginning of the sequence of the matched probe pairs
-   * @param contained_output_begin Beginning of the sequence of the matched contained pairs
-   * @param pair_equal The binary function to compare two pairs for equality
-   * @param stream CUDA stream used for pair_retrieve
-   * @return Pair of iterators pointing to the last elements in the output
-   */
-  template <typename InputIt, typename OutputIt1, typename OutputIt2, typename PairEqual>
-  std::pair<OutputIt1, OutputIt2> pair_retrieve(InputIt first,
-                                                InputIt last,
-                                                OutputIt1 probe_output_begin,
-                                                OutputIt2 contained_output_begin,
-                                                PairEqual pair_equal,
-                                                cudaStream_t stream = 0) const;
-
-  /**
-   * @brief Retrieves all pairs matching the input probe pair in the range `[first, last)`.
-   *
-   * The `pair_` prefix indicates that the input data type is convertible to the map's `value_type`.
-   * If pair_equal(*(first + i), slot[j]) returns true, then *(first+i) is stored to
-   * `probe_output_begin`, and slot[j] is stored to `contained_output_begin`. If *(first+i) doesn't
-   * have matches in the map, copies *(first + i) in `probe_output_begin` and a pair of
-   * `empty_key_sentinel` and `empty_value_sentinel` in `contained_output_begin`.
-   *
-   * Behavior is undefined if the size of the output range exceeds
-   * `std::distance(probe_output_begin, probe_output_end)` (or
-   * `std::distance(contained_output_begin, contained_output_end)`). Use
-   * `pair_count()` to determine the size of the output range.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * static_multimap<K, V>::value_type></tt> is `true`
-   * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-   * `InputIt`s `value_type`.
-   * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-   * the map's `value_type`.
-   * @tparam PairEqual Binary callable type
-   * @param first Beginning of the sequence of pairs
-   * @param last End of the sequence of pairs
-   * @param probe_output_begin Beginning of the sequence of the matched probe pairs
-   * @param contained_output_begin Beginning of the sequence of the matched contained pairs
-   * @param pair_equal The binary function to compare two pairs for equality
-   * @param stream CUDA stream used for pair_retrieve_outer
-   * @return Pair of iterators pointing to the last elements in the output
-   */
-  template <typename InputIt, typename OutputIt1, typename OutputIt2, typename PairEqual>
-  std::pair<OutputIt1, OutputIt2> pair_retrieve_outer(InputIt first,
-                                                      InputIt last,
-                                                      OutputIt1 probe_output_begin,
-                                                      OutputIt2 contained_output_begin,
-                                                      PairEqual pair_equal,
-                                                      cudaStream_t stream = 0) const;
-
- private:
-  /**
-   * @brief Indicates if vector-load is used.
-   *
-   * Users have no explicit control on whether vector-load is used.
-   *
-   * @return Boolean indicating if vector-load is used.
-   */
-  static __host__ __device__ constexpr bool uses_vector_load() noexcept
-  {
-    return cuco::detail::is_packable<value_type>();
-  }
-
-  /**
-   * @brief Returns the number of pairs loaded with each vector-load
-   */
-  static __host__ __device__ constexpr uint32_t vector_width() noexcept
-  {
-    return ProbeSequence::vector_width();
-  }
-
-  /**
-   * @brief Returns the warp size.
-   */
-  static __host__ __device__ constexpr uint32_t warp_size() noexcept { return 32u; }
-
-  /**
-   * @brief Custom deleter for unique pointer of slots.
-   */
-  struct slot_deleter {
-    slot_deleter(allocator_type& a, size_t& c, cuda::stream_ref s)
-      : allocator{a}, capacity{c}, stream{s}
-    {
-    }
-
-    slot_deleter(slot_deleter const&) = default;
-
-    void operator()(pair_atomic_type* ptr) { allocator.deallocate(ptr, capacity, stream); }
-
-    allocator_type& allocator;
-    size_t& capacity;
-    cuda::stream_ref stream;
-  };
-
-  class device_view_impl_base;
-  class device_mutable_view_impl;
-  class device_view_impl;
-
-  template <typename ViewImpl>
-  class device_view_base {
-   protected:
-    // Import member type definitions from `static_multimap`
-    using value_type          = value_type;
-    using key_type            = Key;
-    using mapped_type         = Value;
-    using pair_atomic_type    = pair_atomic_type;
-    using iterator            = pair_atomic_type*;
-    using const_iterator      = pair_atomic_type const*;
-    using probe_sequence_type = probe_sequence_type;
-
-    __host__ __device__ device_view_base(pair_atomic_type* slots,
-                                         std::size_t capacity,
-                                         empty_key<Key> empty_key_sentinel,
-                                         empty_value<Value> empty_value_sentinel) noexcept
-      : impl_{slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value}
-    {
-    }
-
-   public:
-    /**
-     * @brief Gets slots array.
-     *
-     * @return Slots array
-     */
-    __device__ __forceinline__ pair_atomic_type* get_slots() noexcept { return impl_.get_slots(); }
-
-    /**
-     * @brief Gets slots array.
-     *
-     * @return Slots array
-     */
-    __device__ __forceinline__ pair_atomic_type const* get_slots() const noexcept
-    {
-      return impl_.get_slots();
-    }
-
-    /**
-     * @brief Gets the maximum number of elements the hash map can hold.
-     *
-     * @return The maximum number of elements the hash map can hold
-     */
-    __host__ __device__ __forceinline__ std::size_t get_capacity() const noexcept
-    {
-      return impl_.get_capacity();
-    }
-
-    /**
-     * @brief Gets the sentinel value used to represent an empty key slot.
-     *
-     * @return The sentinel value used to represent an empty key slot
-     */
-    __host__ __device__ __forceinline__ Key get_empty_key_sentinel() const noexcept
-    {
-      return impl_.get_empty_key_sentinel();
-    }
-
-    /**
-     * @brief Gets the sentinel value used to represent an empty value slot.
-     *
-     * @return The sentinel value used to represent an empty value slot
-     */
-    __host__ __device__ __forceinline__ Value get_empty_value_sentinel() const noexcept
-    {
-      return impl_.get_empty_value_sentinel();
-    }
-
-   protected:
-    ViewImpl impl_;
-  };  // class device_view_base
-
- public:
-  /**
-   * @brief Mutable, non-owning view-type that may be used in device code to
-   * perform singular inserts into the map.
-   *
-   * `device_mutable_view` is trivially-copyable and is intended to be passed by
-   * value.
-   *
-   * Example:
-   * \code{.cpp}
-   * cuco::static_multimap<int,int> m{100'000, -1, -1};
-   *
-   * // Inserts a sequence of pairs {{0,0}, {1,1}, ... {i,i}}
-   * thrust::for_each(thrust::make_counting_iterator(0),
-   *                  thrust::make_counting_iterator(50'000),
-   *                  [map = m.get_device_mutable_view()]
-   *                  __device__ (auto i) mutable {
-   *                     map.insert(cuco::pair{i,i});
-   *                  });
-   * \endcode
-   */
-  class device_mutable_view : public device_view_base<device_mutable_view_impl> {
-   public:
-    using view_base_type =
-      device_view_base<device_mutable_view_impl>;              ///< Base view implementation type
-    using value_type  = typename view_base_type::value_type;   ///< Type of key/value pairs
-    using key_type    = typename view_base_type::key_type;     ///< Key type
-    using mapped_type = typename view_base_type::mapped_type;  ///< Type of the mapped values
-    using iterator =
-      typename view_base_type::iterator;  ///< Type of the forward iterator to `value_type`
-    using const_iterator =
-      typename view_base_type::const_iterator;  ///< Type of the forward iterator to `const
-                                                ///< value_type`
-
-    /**
-     * @brief Construct a mutable view of the first `capacity` slots of the
-     * slots array pointed to by `slots`.
-     *
-     * @param slots Pointer to beginning of initialized slots array
-     * @param capacity The number of slots viewed by this object
-     * @param empty_key_sentinel The reserved value for keys to represent empty
-     * slots
-     * @param empty_value_sentinel The reserved value for mapped values to
-     * represent empty slots
-     */
-    __host__ __device__ device_mutable_view(pair_atomic_type* slots,
-                                            std::size_t capacity,
-                                            empty_key<Key> empty_key_sentinel,
-                                            empty_value<Value> empty_value_sentinel) noexcept
-      : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel}
-    {
-    }
-
-    /**
-     * @brief Inserts the specified key/value pair into the map.
-     *
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param g The Cooperative Group that performs the insert
-     * @param insert_pair The pair to insert
-     */
-    template <typename ParentCG>
-    __device__ __forceinline__ void insert(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-      value_type const& insert_pair) noexcept;
-
-   private:
-    using device_view_base<device_mutable_view_impl>::impl_;
-  };  // class device mutable view
-
-  /**
-   * @brief Non-owning view-type that may be used in device code to
-   * perform singular find and contains operations for the map.
-   *
-   * `device_view` is trivially-copyable and is intended to be passed by
-   * value.
-   *
-   */
-  class device_view : public device_view_base<device_view_impl> {
-   public:
-    using view_base_type = device_view_base<device_view_impl>;    ///< Base view implementation type
-    using value_type     = typename view_base_type::value_type;   ///< Type of key/value pairs
-    using key_type       = typename view_base_type::key_type;     ///< Key type
-    using mapped_type    = typename view_base_type::mapped_type;  ///< Type of the mapped values
-    using iterator =
-      typename view_base_type::iterator;  ///< Type of the forward iterator to `value_type`
-    using const_iterator =
-      typename view_base_type::const_iterator;  ///< Type of the forward iterator to `const
-                                                ///< value_type`
-
-    /**
-     * @brief Construct a view of the first `capacity` slots of the
-     * slots array pointed to by `slots`.
-     *
-     * @param slots Pointer to beginning of initialized slots array
-     * @param capacity The number of slots viewed by this object
-     * @param empty_key_sentinel The reserved value for keys to represent empty
-     * slots
-     * @param empty_value_sentinel The reserved value for mapped values to
-     * represent empty slots
-     */
-    __host__ __device__ device_view(pair_atomic_type* slots,
-                                    std::size_t capacity,
-                                    empty_key<Key> empty_key_sentinel,
-                                    empty_value<Value> empty_value_sentinel) noexcept
-      : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel}
-    {
-    }
-
-    /**
-     * @brief Makes a copy of given `device_view` using non-owned memory.
-     *
-     * This function is intended to be used to create shared memory copies of small static maps,
-     * although global memory can be used as well.
-     *
-     * @tparam CG The type of the cooperative thread group
-     * @param g The cooperative thread group used to copy the slots
-     * @param source_device_view `device_view` to copy from
-     * @param memory_to_use Array large enough to support `capacity` elements. Object does not
-     * take the ownership of the memory
-     * @return Copy of passed `device_view`
-     */
-    template <typename CG>
-    __device__ __forceinline__ static device_view make_copy(
-      CG g, pair_atomic_type* const memory_to_use, device_view source_device_view) noexcept;
-
-    /**
-     * @brief Flushes per-CG buffer into the output sequence.
-     *
-     * A given CUDA Cooperative Group, `g`, loads `num_outputs` key-value pairs from `output_buffer`
-     * and writes them into global memory in a coalesced fashion. CG-wide `memcpy_sync` is used if
-     * `thrust::is_contiguous_iterator_v<OutputIt>` returns true. All threads of `g` must be active
-     * due to implicit CG-wide synchronization during flushing.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam atomicT Type of atomic storage
-     * @tparam OutputIt Device accessible output iterator whose `value_type` is
-     * constructible from the map's `value_type`
-     * @param g The Cooperative Group used to flush output buffer
-     * @param num_outputs Number of valid output in the buffer
-     * @param output_buffer Buffer of the key/value pair sequence
-     * @param num_matches Size of the output sequence
-     * @param output_begin Beginning of the output sequence of key/value pairs
-     */
-    template <typename CG, typename atomicT, typename OutputIt>
-    __device__ __forceinline__ void flush_output_buffer(CG g,
-                                                        uint32_t const num_outputs,
-                                                        value_type* output_buffer,
-                                                        atomicT* num_matches,
-                                                        OutputIt output_begin) noexcept;
-
-    /**
-     * @brief Flushes per-CG buffer into the output sequences.
-     *
-     * A given CUDA Cooperative Group, `g`, loads `num_outputs` elements from `probe_output_buffer`
-     * and `num_outputs` elements from `contained_output_buffer`, then writes them into global
-     * memory started from `probe_output_begin` and `contained_output_begin` respectively. All
-     * threads of `g` must be active due to implicit CG-wide synchronization during flushing.
-     *
-     * @tparam CG Cooperative Group type
-     * @tparam atomicT Type of atomic storage
-     * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-     * `InputIt`s `value_type`.
-     * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-     * the map's `value_type`.
-     * @param g The Cooperative Group used to flush output buffer
-     * @param num_outputs Number of valid output in the buffer
-     * @param probe_output_buffer Buffer of the matched probe pair sequence
-     * @param contained_output_buffer Buffer of the matched contained pair sequence
-     * @param num_matches Size of the output sequence
-     * @param probe_output_begin Beginning of the output sequence of the matched probe pairs
-     * @param contained_output_begin Beginning of the output sequence of the matched contained
-     * pairs
-     */
-    template <typename CG, typename atomicT, typename OutputIt1, typename OutputIt2>
-    __device__ __forceinline__ void flush_output_buffer(CG g,
-                                                        uint32_t const num_outputs,
-                                                        value_type* probe_output_buffer,
-                                                        value_type* contained_output_buffer,
-                                                        atomicT* num_matches,
-                                                        OutputIt1 probe_output_begin,
-                                                        OutputIt2 contained_output_begin) noexcept;
-
-    /**
-     * @brief Indicates whether the key `k` exists in the map.
-     *
-     * If the key `k` was inserted into the map, `contains` returns
-     * true. Otherwise, it returns false. Uses the CUDA Cooperative Groups API to
-     * to leverage multiple threads to perform a single `contains` operation. This provides a
-     * significant boost in throughput compared to the non Cooperative Group
-     * `contains` at moderate to high load factors.
-     *
-     * ProbeSequence hashers should be callable with both ProbeKey and Key type.
-     * `std::invoke_result<KeyEqual, ProbeKey, Key>` must be well-formed.
-     *
-     * If `key_equal(probe_key, slot_key)` returns true, `hash(probe_key) == hash(slot_key)` must
-     * also be true.
-     *
-     * @tparam ProbeKey Probe key type
-     * @tparam KeyEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param g The Cooperative Group used to perform the contains operation
-     * @param k The key to search for
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return A boolean indicating whether the key/value pair
-     * containing `k` was inserted
-     */
-    template <typename ProbeKey,
-              typename KeyEqual = cuda::std::equal_to<key_type>,
-              typename ParentCG = void>
-    __device__ __forceinline__ bool contains(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-      ProbeKey const& k,
-      KeyEqual key_equal = KeyEqual{}) const noexcept;
-
-    /**
-     * @brief Indicates whether the pair `p` exists in the map.
-     *
-     * If the pair `p` was inserted into the map, `contains` returns
-     * true. Otherwise, it returns false. Uses the CUDA Cooperative Groups API to
-     * to leverage multiple threads to perform a single `contains` operation. This provides a
-     * significant boost in throughput compared to the non Cooperative Group
-     * `contains` at moderate to high load factors.
-     *
-     * ProbeSequence hashers should be callable with both ProbePair::first_type and Key type.
-     * `std::invoke_result<KeyEqual, ProbePair::first_type, Key>` must be well-formed.
-     *
-     * If `pair_equal(p, slot_content)` returns true, `hash(p.first) == hash(slot_key)` must
-     * also be true.
-     *
-     * @tparam ProbePair Probe pair type
-     * @tparam PairEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param g The Cooperative Group used to perform the contains operation
-     * @param p The pair to search for
-     * @param pair_equal The binary callable used to compare input pair and slot content
-     * for equality
-     * @return A boolean indicating whether the input pair was inserted in the map
-     */
-    template <typename ProbePair, typename PairEqual, typename ParentCG>
-    __device__ __forceinline__ bool pair_contains(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-      ProbePair const& p,
-      PairEqual pair_equal) const noexcept;
-
-    /**
-     * @brief Counts the occurrence of a given key contained in multimap.
-     *
-     * For a given key, `k`, counts all matching keys, `k'`, as determined by `key_equal(k, k')` and
-     * returns the sum of all matches for `k`.
-     *
-     * @tparam KeyEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param g The Cooperative Group used to perform the count operation
-     * @param k The key to search for
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return Number of matches found by the current thread
-     */
-    template <typename KeyEqual = cuda::std::equal_to<key_type>, typename ParentCG = void>
-    __device__ __forceinline__ std::size_t count(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-      Key const& k,
-      KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Counts the occurrence of a given key contained in multimap. If no
-     * matches can be found for a given key, the corresponding occurrence is 1.
-     *
-     * For a given key, `k`, counts all matching keys, `k'`, as determined by `key_equal(k, k')` and
-     * returns the sum of all matches for `k`. If `k` does not have any matches, returns 1.
-     *
-     * @tparam KeyEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param g The Cooperative Group used to perform the count operation
-     * @param k The key to search for
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     * @return Number of matches found by the current thread
-     */
-    template <typename KeyEqual = cuda::std::equal_to<key_type>, typename ParentCG = void>
-    __device__ __forceinline__ std::size_t count_outer(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-      Key const& k,
-      KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Counts the occurrence of a given key/value pair contained in multimap.
-     *
-     * For a given pair, `p`, counts all matching pairs, `p'`, as determined by `pair_equal(p, p')`
-     * and returns the sum of all matches for `p`.
-     *
-     * @tparam PairEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param g The Cooperative Group used to perform the pair_count operation
-     * @param pair The pair to search for
-     * @param pair_equal The binary callable used to compare two pairs
-     * for equality
-     * @return Number of matches found by the current thread
-     */
-    template <typename PairEqual, typename ParentCG>
-    __device__ __forceinline__ std::size_t pair_count(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-      value_type const& pair,
-      PairEqual pair_equal) noexcept;
-
-    /**
-     * @brief Counts the occurrence of a given key/value pair contained in multimap.
-     * If no matches can be found for a given key, the corresponding occurrence is 1.
-     *
-     * For a given pair, `p`, counts all matching pairs, `p'`, as determined by `pair_equal(p, p')`
-     * and returns the sum of all matches for `p`. If `p` does not have any matches, returns 1.
-     *
-     * @tparam PairEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param g The Cooperative Group used to perform the pair_count operation
-     * @param pair The pair to search for
-     * @param pair_equal The binary callable used to compare two pairs
-     * for equality
-     * @return Number of matches found by the current thread
-     */
-    template <typename PairEqual, typename ParentCG>
-    __device__ __forceinline__ std::size_t pair_count_outer(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
-      value_type const& pair,
-      PairEqual pair_equal) noexcept;
-
-    /**
-     * @brief Retrieves all the matches of a given key contained in multimap with per-flushing-CG
-     * shared memory buffer.
-     *
-     * For key `k` existing in the map, copies `k` and all associated values to unspecified
-     * locations in `[output_begin, output_end)`.
-     *
-     * @tparam buffer_size Size of the output buffer
-     * @tparam FlushingCG Type of Cooperative Group used to flush output buffer
-     * @tparam atomicT Type of atomic storage
-     * @tparam OutputIt Device accessible output iterator whose `value_type` is
-     * constructible from the map's `value_type`
-     * @tparam KeyEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param flushing_cg The Cooperative Group used to flush output buffer
-     * @param probing_cg The Cooperative Group used to retrieve
-     * @param k The key to search for
-     * @param flushing_cg_counter Pointer to flushing_cg counter
-     * @param output_buffer Shared memory buffer of the key/value pair sequence
-     * @param num_matches Size of the output sequence
-     * @param output_begin Beginning of the output sequence of key/value pairs
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     */
-    template <uint32_t buffer_size,
-              typename FlushingCG,
-              typename atomicT,
-              typename OutputIt,
-              typename KeyEqual = cuda::std::equal_to<key_type>,
-              typename ParentCG = void>
-    __device__ __forceinline__ void retrieve(
-      FlushingCG flushing_cg,
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-      Key const& k,
-      uint32_t* flushing_cg_counter,
-      value_type* output_buffer,
-      atomicT* num_matches,
-      OutputIt output_begin,
-      KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Retrieves all the matches of a given key contained in multimap with per-flushing-CG
-     * shared memory buffer.
-     *
-     * For key `k` existing in the map, copies `k` and all associated values to unspecified
-     * locations in `[output_begin, output_end)`. If `k` does not have any matches, copies `k` and
-     * `empty_value_sentinel()` into the output.
-     *
-     * @tparam buffer_size Size of the output buffer
-     * @tparam FlushingCG Type of Cooperative Group used to flush output buffer
-     * @tparam atomicT Type of atomic storage
-     * @tparam OutputIt Device accessible output iterator whose `value_type` is
-     * constructible from the map's `value_type`
-     * @tparam KeyEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param flushing_cg The Cooperative Group used to flush output buffer
-     * @param probing_cg The Cooperative Group used to retrieve
-     * @param k The key to search for
-     * @param flushing_cg_counter Pointer to flushing_cg counter
-     * @param output_buffer Shared memory buffer of the key/value pair sequence
-     * @param num_matches Size of the output sequence
-     * @param output_begin Beginning of the output sequence of key/value pairs
-     * @param key_equal The binary callable used to compare two keys
-     * for equality
-     */
-    template <uint32_t buffer_size,
-              typename FlushingCG,
-              typename atomicT,
-              typename OutputIt,
-              typename KeyEqual = cuda::std::equal_to<key_type>,
-              typename ParentCG = void>
-    __device__ __forceinline__ void retrieve_outer(
-      FlushingCG flushing_cg,
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-      Key const& k,
-      uint32_t* flushing_cg_counter,
-      value_type* output_buffer,
-      atomicT* num_matches,
-      OutputIt output_begin,
-      KeyEqual key_equal = KeyEqual{}) noexcept;
-
-    /**
-     * @brief Retrieves all the matches of a given pair
-     *
-     * For pair `p` with `n = pair_count(cg, p, pair_equal)` matching pairs, if `pair_equal(p,
-     * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`,
-     * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an
-     * unspecified value of `j` where `0 <= j < n`.
-     *
-     * Concurrent reads or writes to any of the output ranges results in undefined behavior.
-     *
-     * Behavior is undefined if the extent of any of the output ranges is less than `n`.
-     *
-     * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-     * `pair`'s `Key` type.
-     * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-     * `pair`'s `Value` type.
-     * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from
-     * the map's `key_type`.
-     * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from
-     * the map's `mapped_type`.
-     * @tparam PairEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param probing_cg The Cooperative Group used to retrieve
-     * @param pair The pair to search for
-     * @param probe_key_begin Beginning of the output sequence of the matched probe keys
-     * @param probe_val_begin Beginning of the output sequence of the matched probe values
-     * @param contained_key_begin Beginning of the output sequence of the matched contained keys
-     * @param contained_val_begin Beginning of the output sequence of the matched contained values
-     * @param pair_equal The binary callable used to compare two pairs for equality
-     */
-    template <typename OutputIt1,
-              typename OutputIt2,
-              typename OutputIt3,
-              typename OutputIt4,
-              typename PairEqual,
-              typename ParentCG>
-    __device__ __forceinline__ void pair_retrieve(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-      value_type const& pair,
-      OutputIt1 probe_key_begin,
-      OutputIt2 probe_val_begin,
-      OutputIt3 contained_key_begin,
-      OutputIt4 contained_val_begin,
-      PairEqual pair_equal) noexcept;
-
-    /**
-     * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG
-     * shared memory buffer.
-     *
-     * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations
-     * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in
-     * `[contained_output_begin, contained_output_end)`.
-     *
-     * @tparam buffer_size Size of the output buffer
-     * @tparam FlushingCG Type of Cooperative Group used to flush output buffer
-     * @tparam atomicT Type of atomic storage
-     * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-     * `InputIt`s `value_type`.
-     * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-     * the map's `value_type`.
-     * @tparam PairEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param flushing_cg The Cooperative Group used to flush output buffer
-     * @param probing_cg The Cooperative Group used to retrieve
-     * @param pair The pair to search for
-     * @param warp_counter Pointer to the warp counter
-     * @param probe_output_buffer Buffer of the matched probe pair sequence
-     * @param contained_output_buffer Buffer of the matched contained pair sequence
-     * @param num_matches Size of the output sequence
-     * @param probe_output_begin Beginning of the output sequence of the matched probe pairs
-     * @param contained_output_begin Beginning of the output sequence of the matched contained
-     * pairs
-     * @param pair_equal The binary callable used to compare two pairs for equality
-     */
-    template <uint32_t buffer_size,
-              typename FlushingCG,
-              typename atomicT,
-              typename OutputIt1,
-              typename OutputIt2,
-              typename PairEqual,
-              typename ParentCG>
-    __device__ __forceinline__ void pair_retrieve(
-      FlushingCG flushing_cg,
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-      value_type const& pair,
-      uint32_t* warp_counter,
-      value_type* probe_output_buffer,
-      value_type* contained_output_buffer,
-      atomicT* num_matches,
-      OutputIt1 probe_output_begin,
-      OutputIt2 contained_output_begin,
-      PairEqual pair_equal) noexcept;
-
-    /**
-     * @brief Retrieves all the matches of a given pair
-     *
-     * For pair `p` with `n = pair_count_outer(cg, p, pair_equal)` matching pairs, if `pair_equal(p,
-     * slot)` returns true, stores `probe_key_begin[j] = p.first`, `probe_val_begin[j] = p.second`,
-     * `contained_key_begin[j] = slot.first`, and `contained_val_begin[j] = slot.second` for an
-     * unspecified value of `j` where `0 <= j < n`. If `p` does not have any matches, stores
-     * `probe_key_begin[0] = p.first`, `probe_val_begin[0] = p.second`, `contained_key_begin[0] =
-     * empty_key_sentinel`, and `contained_val_begin[0] = empty_value_sentinel`.
-     *
-     * Concurrent reads or writes to any of the output ranges results in undefined behavior.
-     *
-     * Behavior is undefined if the extent of any of the output ranges is less than `n`.
-     *
-     * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-     * `pair`'s `Key` type.
-     * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-     * `pair`'s `Value` type.
-     * @tparam OutputIt3 Device accessible output iterator whose `value_type` is constructible from
-     * the map's `key_type`.
-     * @tparam OutputIt4 Device accessible output iterator whose `value_type` is constructible from
-     * the map's `mapped_type`.
-     * @tparam PairEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param probing_cg The Cooperative Group used to retrieve
-     * @param pair The pair to search for
-     * @param probe_key_begin Beginning of the output sequence of the matched probe keys
-     * @param probe_val_begin Beginning of the output sequence of the matched probe values
-     * @param contained_key_begin Beginning of the output sequence of the matched contained keys
-     * @param contained_val_begin Beginning of the output sequence of the matched contained values
-     * @param pair_equal The binary callable used to compare two pairs for equality
-     */
-    template <typename OutputIt1,
-              typename OutputIt2,
-              typename OutputIt3,
-              typename OutputIt4,
-              typename PairEqual,
-              typename ParentCG>
-    __device__ __forceinline__ void pair_retrieve_outer(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-      value_type const& pair,
-      OutputIt1 probe_key_begin,
-      OutputIt2 probe_val_begin,
-      OutputIt3 contained_key_begin,
-      OutputIt4 contained_val_begin,
-      PairEqual pair_equal) noexcept;
-
-    /**
-     * @brief Retrieves all the matches of a given pair contained in multimap with per-flushing-CG
-     * shared memory buffer.
-     *
-     * For pair `p`, if pair_equal(p, slot[j]) returns true, copies `p` to unspecified locations
-     * in `[probe_output_begin, probe_output_end)` and copies slot[j] to unspecified locations in
-     * `[contained_output_begin, contained_output_end)`. If `p` does not have any matches, copies
-     * `p` and a pair of `empty_key_sentinel` and `empty_value_sentinel` into the output.
-     *
-     * @tparam buffer_size Size of the output buffer
-     * @tparam FlushingCG Type of Cooperative Group used to flush output buffer
-     * @tparam atomicT Type of atomic storage
-     * @tparam OutputIt1 Device accessible output iterator whose `value_type` is constructible from
-     * `InputIt`s `value_type`.
-     * @tparam OutputIt2 Device accessible output iterator whose `value_type` is constructible from
-     * the map's `value_type`.
-     * @tparam PairEqual Binary callable type
-     * @tparam ParentCG Type of parent Cooperative Group
-     *
-     * @param flushing_cg The Cooperative Group used to flush output buffer
-     * @param probing_cg The Cooperative Group used to retrieve
-     * @param pair The pair to search for
-     * @param flushing_cg_counter Pointer to the flushing CG counter
-     * @param probe_output_buffer Buffer of the matched probe pair sequence
-     * @param contained_output_buffer Buffer of the matched contained pair sequence
-     * @param num_matches Size of the output sequence
-     * @param probe_output_begin Beginning of the output sequence of the matched probe pairs
-     * @param contained_output_begin Beginning of the output sequence of the matched contained
-     * pairs
-     * @param pair_equal The binary callable used to compare two pairs for equality
-     */
-    template <uint32_t buffer_size,
-              typename FlushingCG,
-              typename atomicT,
-              typename OutputIt1,
-              typename OutputIt2,
-              typename PairEqual,
-              typename ParentCG>
-    __device__ __forceinline__ void pair_retrieve_outer(
-      FlushingCG flushing_cg,
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
-      value_type const& pair,
-      uint32_t* flushing_cg_counter,
-      value_type* probe_output_buffer,
-      value_type* contained_output_buffer,
-      atomicT* num_matches,
-      OutputIt1 probe_output_begin,
-      OutputIt2 contained_output_begin,
-      PairEqual pair_equal) noexcept;
-
-   private:
-    using device_view_base<device_view_impl>::impl_;  ///< Implementation detail of `device_view`
-  };  // class device_view
-
-  /**
-   * @brief Return the raw pointer of the hash map slots.
-   *
-   * @return Raw pointer of the hash map slots
-   */
-  value_type* raw_slots() noexcept
-  {
-    // Unsafe access to the slots stripping away their atomic-ness to allow non-atomic access.
-    // TODO: to be replace by atomic_ref when it's ready
-    return reinterpret_cast<value_type*>(slots_.get());
-  }
-
-  /**
-   * @brief Return the raw pointer of the hash map slots.
-   *
-   * @return Raw pointer of the hash map slots
-   */
-  value_type const* raw_slots() const noexcept
-  {
-    // Unsafe access to the slots stripping away their atomic-ness to allow non-atomic access.
-    // TODO: to be replace by atomic_ref when it's ready
-    return reinterpret_cast<value_type const*>(slots_.get());
-  }
-
-  /**
-   * @brief Gets the maximum number of elements the hash map can hold.
-   *
-   * @return The maximum number of elements the hash map can hold
-   */
-  std::size_t get_capacity() const noexcept { return capacity_; }
-
-  /**
-   * @brief Gets the number of elements in the hash map.
-   *
-   * @param stream CUDA stream used to get the number of inserted elements
-   * @return The number of elements in the map
-   */
-  std::size_t get_size(cudaStream_t stream = 0) const noexcept;
-
-  /**
-   * @brief Gets the load factor of the hash map.
-   *
-   * @param stream CUDA stream used to get the load factor
-   * @return The load factor of the hash map
-   */
-  float get_load_factor(cudaStream_t stream = 0) const noexcept;
-
-  /**
-   * @brief Gets the sentinel value used to represent an empty key slot.
-   *
-   * @return The sentinel value used to represent an empty key slot
-   */
-  Key get_empty_key_sentinel() const noexcept { return empty_key_sentinel_; }
-
-  /**
-   * @brief Gets the sentinel value used to represent an empty value slot.
-   *
-   * @return The sentinel value used to represent an empty value slot
-   */
-  Value get_empty_value_sentinel() const noexcept { return empty_value_sentinel_; }
-
-  /**
-   * @brief Constructs a device_view object based on the members of the `static_multimap`
-   * object.
-   *
-   * @return A device_view object based on the members of the `static_multimap` object
-   */
-  device_view get_device_view() const noexcept
-  {
-    return device_view(slots_.get(),
-                       capacity_,
-                       empty_key<Key>{empty_key_sentinel_},
-                       empty_value<Value>{empty_value_sentinel_});
-  }
-
-  /**
-   * @brief Constructs a device_mutable_view object based on the members of the
-   * `static_multimap` object
-   *
-   * @return A device_mutable_view object based on the members of the `static_multimap` object
-   */
-  device_mutable_view get_device_mutable_view() const noexcept
-  {
-    return device_mutable_view(slots_.get(),
-                               capacity_,
-                               empty_key<Key>{empty_key_sentinel_},
-                               empty_value<Value>{empty_value_sentinel_});
-  }
-
- private:
-  std::size_t capacity_{};        ///< Total number of slots
-  Key empty_key_sentinel_{};      ///< Key value that represents an empty slot
-  Value empty_value_sentinel_{};  ///< Initial value of empty slot
-  allocator_type allocator_{};    ///< Allocator used to allocate slots
-  slot_deleter delete_slots_;     ///< Custom slots deleter
-  std::unique_ptr<pair_atomic_type, slot_deleter> slots_{};  ///< Pointer to flat slots storage
-};  // class static_multimap
 
 }  // namespace cuco
 
-#include <cuco/detail/static_multimap/device_view_impl.inl>
 #include <cuco/detail/static_multimap/static_multimap.inl>
diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu
index 6ceb92b0a..3a0a2dfa7 100644
--- a/tests/static_multimap/count_test.cu
+++ b/tests/static_multimap/count_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -90,14 +90,14 @@ TEMPLATE_TEST_CASE_SIG(
     cuco::linear_probing<CGSize, cuco::default_hash_function<T>>,
     cuco::double_hashing<CGSize, cuco::default_hash_function<T>, cuco::default_hash_function<T>>>;
 
-  auto map = cuco::experimental::static_multimap<T,
-                                                 T,
-                                                 cuco::extent<size_type>,
-                                                 cuda::thread_scope_device,
-                                                 cuda::std::equal_to<T>,
-                                                 probe,
-                                                 cuco::cuda_allocator<cuda::std::byte>,
-                                                 cuco::storage<2>>{
+  auto map = cuco::static_multimap<T,
+                                   T,
+                                   cuco::extent<size_type>,
+                                   cuda::thread_scope_device,
+                                   cuda::std::equal_to<T>,
+                                   probe,
+                                   cuco::cuda_allocator<cuda::std::byte>,
+                                   cuco::storage<2>>{
     num_keys * multiplicity, cuco::empty_key<T>{-1}, cuco::empty_value<T>{-1}};
 
   test_multiplicity_count(map, num_keys);
diff --git a/tests/static_multimap/find_test.cu b/tests/static_multimap/find_test.cu
index 53e10f40d..68e477409 100644
--- a/tests/static_multimap/find_test.cu
+++ b/tests/static_multimap/find_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -119,14 +119,14 @@ TEMPLATE_TEST_CASE_SIG(
     cuco::linear_probing<CGSize, cuco::default_hash_function<T>>,
     cuco::double_hashing<CGSize, cuco::default_hash_function<T>, cuco::default_hash_function<T>>>;
 
-  auto map = cuco::experimental::static_multimap<T,
-                                                 T,
-                                                 cuco::extent<size_type>,
-                                                 cuda::thread_scope_device,
-                                                 cuda::std::equal_to<T>,
-                                                 probe,
-                                                 cuco::cuda_allocator<cuda::std::byte>,
-                                                 cuco::storage<2>>{
+  auto map = cuco::static_multimap<T,
+                                   T,
+                                   cuco::extent<size_type>,
+                                   cuda::thread_scope_device,
+                                   cuda::std::equal_to<T>,
+                                   probe,
+                                   cuco::cuda_allocator<cuda::std::byte>,
+                                   cuco::storage<2>>{
     num_keys, cuco::empty_key<T>{KEY_SENTINEL}, cuco::empty_value<T>{VAL_SENTINEL}};
 
   test_multimap_find(map, num_keys);
diff --git a/tests/static_multimap/for_each_test.cu b/tests/static_multimap/for_each_test.cu
index 826f915d6..fc511ee58 100644
--- a/tests/static_multimap/for_each_test.cu
+++ b/tests/static_multimap/for_each_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -120,13 +120,13 @@ TEMPLATE_TEST_CASE_SIG(
                                    cuco::linear_probing<CGSize, cuco::default_hash_function<Key>>,
                                    cuco::double_hashing<CGSize, cuco::default_hash_function<Key>>>;
 
-  auto set = cuco::experimental::static_multimap{num_keys,
-                                                 cuco::empty_key<Key>{-1},
-                                                 cuco::empty_value<Key>{-1},
-                                                 {},
-                                                 probe{},
-                                                 {},
-                                                 cuco::storage<2>{}};
+  auto set = cuco::static_multimap{num_keys,
+                                   cuco::empty_key<Key>{-1},
+                                   cuco::empty_value<Key>{-1},
+                                   {},
+                                   probe{},
+                                   {},
+                                   cuco::storage<2>{}};
 
   auto unique_keys_begin  = thrust::counting_iterator<Key>(0);
   auto gen_duplicate_keys = cuda::proclaim_return_type<Key>(
diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu
index 78bc5c2b0..13d8d1e4e 100644
--- a/tests/static_multimap/heterogeneous_lookup_test.cu
+++ b/tests/static_multimap/heterogeneous_lookup_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@
 
 // insert key type
 template <typename T>
-struct key_pair {
+struct __align__(sizeof(T) * 2) key_pair {
   T a;
   T b;
 
@@ -73,19 +73,16 @@ struct custom_hasher {
 // User-defined device key equality
 struct custom_key_equal {
   template <typename LHS, typename RHS>
-  __device__ bool operator()(LHS const& lhs, RHS const& rhs)
+  __device__ bool operator()(LHS const& lhs, RHS const& rhs) const
   {
     return thrust::raw_reference_cast(lhs).a == thrust::raw_reference_cast(rhs).a;
   }
 };
 
-TEMPLATE_TEST_CASE("static_multimap heterogeneous lookup tests",
-                   "",
-#if defined(CUCO_HAS_INDEPENDENT_THREADS)  // Key type larger than 8B only supported for sm_70 and
-                                           // up
-                   int64_t,
-#endif
-                   int32_t)
+TEMPLATE_TEST_CASE(
+  "static_multimap heterogeneous lookup tests",
+  "",
+  int32_t)  // key_pair<int32_t> = 8 bytes, key_pair<int64_t> = 16 bytes (exceeds 8-byte limit)
 {
   using Key      = key_pair<TestType>;
   using Value    = TestType;
@@ -98,10 +95,15 @@ TEMPLATE_TEST_CASE("static_multimap heterogeneous lookup tests",
   constexpr std::size_t capacity = num * 2;
   cuco::static_multimap<Key,
                         Value,
+                        cuco::extent<std::size_t>,
                         cuda::thread_scope_device,
-                        cuco::cuda_allocator<char>,
-                        cuco::legacy::linear_probing<1, custom_hasher>>
-    map{capacity, cuco::empty_key<Key>{sentinel_key}, cuco::empty_value<Value>{sentinel_value}};
+                        custom_key_equal,
+                        cuco::linear_probing<1, custom_hasher>,
+                        cuco::cuda_allocator<cuda::std::byte>,
+                        cuco::storage<2>>
+    map{cuco::extent<std::size_t>{capacity},
+        cuco::empty_key<Key>{sentinel_key},
+        cuco::empty_value<Value>{sentinel_value}};
 
   auto insert_pairs = thrust::make_transform_iterator(
     thrust::counting_iterator<int>(0),
@@ -115,14 +117,14 @@ TEMPLATE_TEST_CASE("static_multimap heterogeneous lookup tests",
   {
     thrust::device_vector<bool> contained(num);
     map.insert(insert_pairs, insert_pairs + num);
-    map.contains(probe_keys, probe_keys + num, contained.begin(), custom_key_equal{});
+    map.contains(probe_keys, probe_keys + num, contained.begin());
     REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{}));
   }
 
   SECTION("Non-inserted keys-value pairs should not be contained")
   {
     thrust::device_vector<bool> contained(num);
-    map.contains(probe_keys, probe_keys + num, contained.begin(), custom_key_equal{});
+    map.contains(probe_keys, probe_keys + num, contained.begin());
     REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), cuda::std::identity{}));
   }
 }
diff --git a/tests/static_multimap/insert_contains_test.cu b/tests/static_multimap/insert_contains_test.cu
index 4185c281a..834795a0a 100644
--- a/tests/static_multimap/insert_contains_test.cu
+++ b/tests/static_multimap/insert_contains_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -106,14 +106,14 @@ TEMPLATE_TEST_CASE_SIG(
           cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
           cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
 
-  auto map = cuco::experimental::static_multimap<Key,
-                                                 Value,
-                                                 extent_type,
-                                                 cuda::thread_scope_device,
-                                                 cuda::std::equal_to<Key>,
-                                                 probe,
-                                                 cuco::cuda_allocator<cuda::std::byte>,
-                                                 cuco::storage<2>>{
+  auto map = cuco::static_multimap<Key,
+                                   Value,
+                                   extent_type,
+                                   cuda::thread_scope_device,
+                                   cuda::std::equal_to<Key>,
+                                   probe,
+                                   cuco::cuda_allocator<cuda::std::byte>,
+                                   cuco::storage<2>>{
     extent_type{num_keys}, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
   test_insert(map, num_keys);
diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu
index 4045a77bd..a5cc07e62 100644
--- a/tests/static_multimap/insert_if_test.cu
+++ b/tests/static_multimap/insert_if_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -96,14 +96,14 @@ TEMPLATE_TEST_CASE_SIG(
           cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
           cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
 
-  auto map = cuco::experimental::static_multimap<Key,
-                                                 Value,
-                                                 extent_type,
-                                                 cuda::thread_scope_device,
-                                                 cuda::std::equal_to<Key>,
-                                                 probe,
-                                                 cuco::cuda_allocator<cuda::std::byte>,
-                                                 cuco::storage<2>>{
+  auto map = cuco::static_multimap<Key,
+                                   Value,
+                                   extent_type,
+                                   cuda::thread_scope_device,
+                                   cuda::std::equal_to<Key>,
+                                   probe,
+                                   cuco::cuda_allocator<cuda::std::byte>,
+                                   cuco::storage<2>>{
     num_keys * 2, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
   test_insert_if(map, num_keys);
diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu
index 23f9fb975..fa63a4beb 100644
--- a/tests/static_multimap/multiplicity_test.cu
+++ b/tests/static_multimap/multiplicity_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -101,14 +101,14 @@ TEMPLATE_TEST_CASE_SIG(
     cuco::linear_probing<CGSize, cuco::default_hash_function<T>>,
     cuco::double_hashing<CGSize, cuco::default_hash_function<T>, cuco::default_hash_function<T>>>;
 
-  auto map = cuco::experimental::static_multimap<T,
-                                                 T,
-                                                 cuco::extent<std::size_t>,
-                                                 cuda::thread_scope_device,
-                                                 cuda::std::equal_to<T>,
-                                                 probe,
-                                                 cuco::cuda_allocator<cuda::std::byte>,
-                                                 cuco::storage<2>>{
+  auto map = cuco::static_multimap<T,
+                                   T,
+                                   cuco::extent<std::size_t>,
+                                   cuda::thread_scope_device,
+                                   cuda::std::equal_to<T>,
+                                   probe,
+                                   cuco::cuda_allocator<cuda::std::byte>,
+                                   cuco::storage<2>>{
     num_items * 2, cuco::empty_key<T>{-1}, cuco::empty_value<T>{-1}};
 
   test_multiplicity_two(map, num_items);
diff --git a/tests/static_multimap/retrieve_if_test.cu b/tests/static_multimap/retrieve_if_test.cu
index 6a30ba38d..44770395f 100644
--- a/tests/static_multimap/retrieve_if_test.cu
+++ b/tests/static_multimap/retrieve_if_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -115,7 +115,7 @@ TEMPLATE_TEST_CASE_SIG("static_multimap retrieve_if",
 {
   constexpr size_type num_keys{400};
 
-  using container_type = cuco::experimental::static_multimap<Key, Value>;
+  using container_type = cuco::static_multimap<Key, Value>;
 
   container_type container{num_keys * 2, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
diff --git a/tests/static_multimap/retrieve_test.cu b/tests/static_multimap/retrieve_test.cu
index c30a807cb..23418f015 100644
--- a/tests/static_multimap/retrieve_test.cu
+++ b/tests/static_multimap/retrieve_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -118,14 +118,14 @@ TEMPLATE_TEST_CASE_SIG(
     cuco::linear_probing<CGSize, cuco::default_hash_function<T>>,
     cuco::double_hashing<CGSize, cuco::default_hash_function<T>, cuco::default_hash_function<T>>>;
 
-  auto map = cuco::experimental::static_multimap<T,
-                                                 T,
-                                                 cuco::extent<std::size_t>,
-                                                 cuda::thread_scope_device,
-                                                 cuda::std::equal_to<T>,
-                                                 probe,
-                                                 cuco::cuda_allocator<cuda::std::byte>,
-                                                 cuco::storage<2>>{
+  auto map = cuco::static_multimap<T,
+                                   T,
+                                   cuco::extent<std::size_t>,
+                                   cuda::thread_scope_device,
+                                   cuda::std::equal_to<T>,
+                                   probe,
+                                   cuco::cuda_allocator<cuda::std::byte>,
+                                   cuco::storage<2>>{
     num_items * 2, cuco::empty_key<T>{-1}, cuco::empty_value<T>{-1}};
 
   test_retrieve(map, num_items);