NVIDIA · tarang-jain · Jun 23, 2026 · Jun 23, 2026 · Jun 24, 2026 · Jun 24, 2026
@@ -11,6 +11,11 @@
 #include <cuvs/cluster/kmeans.hpp>
 #include <cuvs/core/c_api.h>
 
+#include <raft/core/copy.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+
 #include "../core/exceptions.hpp"
 #include "../core/interop.hpp"
 
@@ -213,10 +218,14 @@ void _cluster_cost(cuvsResources_t res,
   if (cuvs::core::is_dlpack_device_compatible(X)) {
     using mdspan_type = raft::device_matrix_view<const T, IdxT, raft::row_major>;
 
+    auto d_cost = raft::make_device_scalar<T>(*res_ptr, T{0});
     cuvs::cluster::kmeans::cluster_cost(*res_ptr,
                                         cuvs::core::from_dlpack<mdspan_type>(X_tensor),
                                         cuvs::core::from_dlpack<mdspan_type>(centroids_tensor),
-                                        raft::make_host_scalar_view<T>(&cost_temp));
+                                        d_cost.view());
+    raft::copy(
+      *res_ptr, raft::make_host_scalar_view<T>(&cost_temp), raft::make_const_mdspan(d_cost.view()));
+    raft::resource::sync_stream(*res_ptr);
   } else {
     RAFT_FAIL("X dataset must be accessible on device memory");
   }

@@ -1532,14 +1532,16 @@ void transform(raft::resources const& handle,
  * @param[out] cost           Resulting cluster cost
  * @param[in]  sample_weight  Optional per-sample weights.
  *                            [len = n_samples]
- *
+ * @param[in]  X_norm         Optional precomputed squared L2 row norms of X (||x||^2) [n_samples].
+ *                            When provided, the internal norm computation is skipped.
  */
 void cluster_cost(
   const raft::resources& handle,
   raft::device_matrix_view<const float, int> X,
   raft::device_matrix_view<const float, int> centroids,
-  raft::host_scalar_view<float> cost,
-  std::optional<raft::device_vector_view<const float, int>> sample_weight = std::nullopt);
+  raft::device_scalar_view<float> cost,
+  std::optional<raft::device_vector_view<const float, int>> sample_weight = std::nullopt,
+  std::optional<raft::device_vector_view<const float, int>> X_norm        = std::nullopt);
 
 /**
  * @brief Compute cluster cost
@@ -1554,13 +1556,17 @@ void cluster_cost(
  * @param[out] cost           Resulting cluster cost
  * @param[in]  sample_weight  Optional per-sample weights.
  *                            [len = n_samples]
+ * @param[in]  X_norm         Optional precomputed squared L2 row norms of X (||x||^2,
+ *                            i.e. sum of squares without the sqrt) [n_samples]. When
+ *                            provided, the internal norm computation is skipped.
  */
 void cluster_cost(
   const raft::resources& handle,
   raft::device_matrix_view<const double, int> X,
   raft::device_matrix_view<const double, int> centroids,
-  raft::host_scalar_view<double> cost,
-  std::optional<raft::device_vector_view<const double, int>> sample_weight = std::nullopt);
+  raft::device_scalar_view<double> cost,
+  std::optional<raft::device_vector_view<const double, int>> sample_weight = std::nullopt,
+  std::optional<raft::device_vector_view<const double, int>> X_norm        = std::nullopt);
 
 /**
  * @brief Compute (optionally weighted) cluster cost
@@ -1575,13 +1581,17 @@ void cluster_cost(
  * @param[out] cost           Resulting cluster cost
  * @param[in]  sample_weight  Optional per-sample weights.
  *                            [len = n_samples]
+ * @param[in]  X_norm         Optional precomputed squared L2 row norms of X (||x||^2,
+ *                            i.e. sum of squares without the sqrt) [n_samples]. When
+ *                            provided, the internal norm computation is skipped.
  */
 void cluster_cost(
   const raft::resources& handle,
   raft::device_matrix_view<const float, int64_t> X,
   raft::device_matrix_view<const float, int64_t> centroids,
-  raft::host_scalar_view<float> cost,
-  std::optional<raft::device_vector_view<const float, int64_t>> sample_weight = std::nullopt);
+  raft::device_scalar_view<float> cost,
+  std::optional<raft::device_vector_view<const float, int64_t>> sample_weight = std::nullopt,
+  std::optional<raft::device_vector_view<const float, int64_t>> X_norm        = std::nullopt);
 
 /**
  * @brief Compute (optionally weighted) cluster cost
@@ -1596,13 +1606,98 @@ void cluster_cost(
  * @param[out] cost           Resulting cluster cost
  * @param[in]  sample_weight  Optional per-sample weights.
  *                            [len = n_samples]
+ * @param[in]  X_norm         Optional precomputed squared L2 row norms of X (||x||^2,
+ *                            i.e. sum of squares without the sqrt) [n_samples]. When
+ *                            provided, the internal norm computation is skipped.
  */
 void cluster_cost(
+  const raft::resources& handle,
+  raft::device_matrix_view<const double, int64_t> X,
+  raft::device_matrix_view<const double, int64_t> centroids,
+  raft::device_scalar_view<double> cost,
+  std::optional<raft::device_vector_view<const double, int64_t>> sample_weight = std::nullopt,
+  std::optional<raft::device_vector_view<const double, int64_t>> X_norm        = std::nullopt);
+
+/**
+ * @brief [deprecated] Compute (optionally weighted) cluster cost, writing the result to a host
+ *        scalar.
+ *
+ * @param[in]  handle         The raft handle
+ * @param[in]  X              Training instances [n_samples x n_features], row-major
+ * @param[in]  centroids      Cluster centroids [n_clusters x n_features], row-major
+ * @param[out] cost           Resulting cluster cost
+ * @param[in]  sample_weight  Optional per-sample weights [n_samples]
+ */
+[[deprecated(
+  "Pass a raft::device_scalar_view; this host-scalar overload forces a D->H copy and "
+  "stream sync")]] void
+cluster_cost(
+  const raft::resources& handle,
+  raft::device_matrix_view<const float, int> X,
+  raft::device_matrix_view<const float, int> centroids,
+  raft::host_scalar_view<float> cost,
+  std::optional<raft::device_vector_view<const float, int>> sample_weight = std::nullopt);
+
+/**
+ * @brief [deprecated] Compute (optionally weighted) cluster cost, writing the result to a host
+ *        scalar.
+ *
+ * @param[in]  handle         The raft handle
+ * @param[in]  X              Training instances [n_samples x n_features], row-major
+ * @param[in]  centroids      Cluster centroids [n_clusters x n_features], row-major
+ * @param[out] cost           Resulting cluster cost
+ * @param[in]  sample_weight  Optional per-sample weights [n_samples]
+ */
+[[deprecated(
+  "Pass a raft::device_scalar_view; this host-scalar overload forces a D->H copy and "
+  "stream sync")]] void
+cluster_cost(
+  const raft::resources& handle,
+  raft::device_matrix_view<const double, int> X,
+  raft::device_matrix_view<const double, int> centroids,
+  raft::host_scalar_view<double> cost,
+  std::optional<raft::device_vector_view<const double, int>> sample_weight = std::nullopt);
+
+/**
+ * @brief [deprecated] Compute (optionally weighted) cluster cost, writing the result to a host
+ *        scalar.
+ *
+ * @param[in]  handle         The raft handle
+ * @param[in]  X              Training instances [n_samples x n_features], row-major
+ * @param[in]  centroids      Cluster centroids [n_clusters x n_features], row-major
+ * @param[out] cost           Resulting cluster cost
+ * @param[in]  sample_weight  Optional per-sample weights [n_samples]
+ */
+[[deprecated(
+  "Pass a raft::device_scalar_view; this host-scalar overload forces a D->H copy and "
+  "stream sync")]] void
+cluster_cost(
+  const raft::resources& handle,
+  raft::device_matrix_view<const float, int64_t> X,
+  raft::device_matrix_view<const float, int64_t> centroids,
+  raft::host_scalar_view<float> cost,
+  std::optional<raft::device_vector_view<const float, int64_t>> sample_weight = std::nullopt);
+
+/**
+ * @brief [deprecated] Compute (optionally weighted) cluster cost, writing the result to a host
+ *        scalar.
+ *
+ * @param[in]  handle         The raft handle
+ * @param[in]  X              Training instances [n_samples x n_features], row-major
+ * @param[in]  centroids      Cluster centroids [n_clusters x n_features], row-major
+ * @param[out] cost           Resulting cluster cost
+ * @param[in]  sample_weight  Optional per-sample weights [n_samples]
+ */
+[[deprecated(
+  "Pass a raft::device_scalar_view; this host-scalar overload forces a D->H copy and "
+  "stream sync")]] void
+cluster_cost(
   const raft::resources& handle,
   raft::device_matrix_view<const double, int64_t> X,
   raft::device_matrix_view<const double, int64_t> centroids,
   raft::host_scalar_view<double> cost,
   std::optional<raft::device_vector_view<const double, int64_t>> sample_weight = std::nullopt);
+
 /**
  * @}
  */

@@ -931,7 +931,11 @@ void kmeans_fit(
       auto centroids_const = raft::make_device_matrix_view<const DataT, IndexT>(
         cur_centroids_ptr, n_clusters, n_features);
 
-      iter_inertia = DataT{0};
+      auto d_iter_inertia = raft::make_device_scalar<DataT>(handle, DataT{0});
+      auto d_batch_cost   = raft::make_device_scalar<DataT>(handle, DataT{0});
+      DataT* p_acc        = d_iter_inertia.data_handle();
+      DataT* p_batch      = d_batch_cost.data_handle();
+
       data_batches.reset();
       using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>;
       std::optional<wt_iter_t> wt_it;
@@ -956,15 +960,33 @@ void kmeans_fit(
             cur_batch_weights(static_cast<IndexT>(data_batch.offset()), wt_data, cur_batch_size);
         }
 
-        DataT batch_cost = DataT{0};
-        cuvs::cluster::kmeans::cluster_cost(handle,
-                                            batch_data_view,
-                                            centroids_const,
-                                            raft::make_host_scalar_view(&batch_cost),
-                                            batch_sw);
+        std::optional<raft::device_vector_view<const DataT, IndexT>> batch_xnorm = std::nullopt;
+        if (need_compute_norms) {
+          if constexpr (data_on_device) {
+            batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
+              L2NormBatch.data_handle() + data_batch.offset(), cur_batch_size);
+          } else if (norms_cached) {
+            raft::copy(L2NormBatch.data_handle(),
+                       h_norm_cache.data_handle() + data_batch.offset(),
+                       cur_batch_size,
+                       stream);
+            batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
+              L2NormBatch.data_handle(), cur_batch_size);
+          }
+        }
+
+        cuvs::cluster::kmeans::cluster_cost(
+          handle, batch_data_view, centroids_const, d_batch_cost.view(), batch_sw, batch_xnorm);
 
-        iter_inertia += batch_cost;
+        raft::linalg::map_offset(handle,
+                                 raft::make_device_vector_view<DataT, int>(p_acc, 1),
+                                 [p_acc, p_batch] __device__(int) { return *p_acc + *p_batch; });
       }
+
+      raft::copy(handle,
+                 raft::make_host_scalar_view<DataT>(&iter_inertia),
+                 raft::make_const_mdspan(d_iter_inertia.view()));
+      raft::resource::sync_stream(handle);
     }
 
     if (iter_inertia < inertia[0]) {

@@ -1139,8 +1139,12 @@ void build_hierarchical(const raft::resources& handle,
         reinterpret_cast<const MathT*>(dataset), n_rows, dim);
       auto centroids_view =
         raft::make_device_matrix_view<const MathT, IdxT>(cluster_centers, n_clusters, dim);
-      cuvs::cluster::kmeans::cluster_cost(
-        handle, X_view, centroids_view, raft::make_host_scalar_view<MathT>(inertia));
+      auto d_inertia = raft::make_device_scalar<MathT>(handle, MathT{0});
+      cuvs::cluster::kmeans::cluster_cost(handle, X_view, centroids_view, d_inertia.view());
+      raft::copy(handle,
+                 raft::make_host_scalar_view<MathT>(inertia),
+                 raft::make_const_mdspan(d_inertia.view()));
+      raft::resource::sync_stream(handle, stream);
     } else {
       RAFT_LOG_WARN("Inertia is not computed for non float/double types");
     }

@@ -382,14 +382,17 @@ void min_cluster_distance(raft::resources const& handle,
  * @param[in]  centroids      Cluster centroids [n_clusters x n_features]
  * @param[out] cost           Sum of squared distances to nearest centroid (device)
  * @param[in]  sample_weight  Optional per-sample weights [n_samples]
+ * @param[in]  X_norm         Optional precomputed squared L2 row norms of X (||x||^2) [n_samples].
+ *                            When provided, the internal norm computation is skipped.
  */
 template <typename DataT, typename IndexT>
 void cluster_cost(
   raft::resources const& handle,
   raft::device_matrix_view<const DataT, IndexT> X,
   raft::device_matrix_view<const DataT, IndexT> centroids,
   raft::device_scalar_view<DataT> cost,
-  std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt)
+  std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt,
+  std::optional<raft::device_vector_view<const DataT, IndexT>> X_norm        = std::nullopt)
 {
   auto stream     = raft::resource::get_cuda_stream(handle);
   auto n_clusters = centroids.extent(0);
@@ -398,8 +401,18 @@ void cluster_cost(
 
   rmm::device_uvector<char> workspace(n_samples * sizeof(IndexT), stream);
 
-  auto x_norms = raft::make_device_vector<DataT>(handle, n_samples);
-  raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(handle, X, x_norms.view());
+  std::optional<raft::device_vector<DataT, IndexT>> x_norms_buf;
+  DataT* x_norms_ptr;
+  if (X_norm.has_value()) {
+    RAFT_EXPECTS(X_norm->extent(0) == n_samples, "X_norm size !=n_samples");
+    x_norms_ptr = const_cast<DataT*>(X_norm->data_handle());
+  } else {
+    x_norms_buf.emplace(raft::make_device_vector<DataT, IndexT>(handle, n_samples));
+    raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(
+      handle, X, x_norms_buf->view());
+    x_norms_ptr = x_norms_buf->data_handle();
+  }
+  auto x_norms_view = raft::make_device_vector_view<DataT, IndexT>(x_norms_ptr, n_samples);
 
   auto min_cluster_distance = raft::make_device_vector<DataT>(handle, n_samples);
   rmm::device_uvector<DataT> l2_norm_or_distance_buffer(0, stream);
@@ -412,7 +425,7 @@ void cluster_cost(
     raft::make_device_matrix_view<DataT, IndexT>(
       const_cast<DataT*>(centroids.data_handle()), n_clusters, n_features),
     min_cluster_distance.view(),
-    x_norms.view(),
+    x_norms_view,
     l2_norm_or_distance_buffer,
     metric,
     n_samples,
@@ -431,31 +444,16 @@ void cluster_cost(
     handle, min_cluster_distance.view(), workspace, cost, raft::add_op{});
 }
 
-/**
- * @brief Compute (optionally weighted) cluster cost (inertia) — host-scalar output.
- *
- * Convenience wrapper that copies the result to host and synchronizes.
- *
- * @tparam DataT  float or double
- * @tparam IndexT Index type
- *
- * @param[in]  handle         The raft handle
- * @param[in]  X              Input data [n_samples x n_features]
- * @param[in]  centroids      Cluster centroids [n_clusters x n_features]
- * @param[out] cost           Sum of squared distances to nearest centroid (host)
- * @param[in]  sample_weight  Optional per-sample weights [n_samples]
- */
 template <typename DataT, typename IndexT>
-void cluster_cost(
-  raft::resources const& handle,
-  raft::device_matrix_view<const DataT, IndexT> X,
-  raft::device_matrix_view<const DataT, IndexT> centroids,
-  raft::host_scalar_view<DataT> cost,
-  std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt)
+void cluster_cost_host(const raft::resources& handle,
+                       raft::device_matrix_view<const DataT, IndexT> X,
+                       raft::device_matrix_view<const DataT, IndexT> centroids,
+                       raft::host_scalar_view<DataT> cost,
+                       std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight)
 {
-  auto device_cost = raft::make_device_scalar<DataT>(handle, DataT(0));
-  cuvs::cluster::kmeans::cluster_cost(handle, X, centroids, device_cost.view(), sample_weight);
-  raft::copy(handle, cost, raft::make_const_mdspan(device_cost.view()));
+  auto d_cost = raft::make_device_scalar<DataT>(handle, DataT{0});
+  cluster_cost(handle, X, centroids, d_cost.view(), sample_weight, std::nullopt);
+  raft::copy(handle, cost, raft::make_const_mdspan(d_cost.view()));
   raft::resource::sync_stream(handle);
 }