Skip to content
Open
11 changes: 10 additions & 1 deletion c/src/cluster/kmeans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
#include <cuvs/cluster/kmeans.hpp>
#include <cuvs/core/c_api.h>

#include <raft/core/copy.hpp>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/mdspan.hpp>
#include <raft/core/resource/cuda_stream.hpp>

#include "../core/exceptions.hpp"
#include "../core/interop.hpp"

Expand Down Expand Up @@ -213,10 +218,14 @@ void _cluster_cost(cuvsResources_t res,
if (cuvs::core::is_dlpack_device_compatible(X)) {
using mdspan_type = raft::device_matrix_view<const T, IdxT, raft::row_major>;

auto d_cost = raft::make_device_scalar<T>(*res_ptr, T{0});
cuvs::cluster::kmeans::cluster_cost(*res_ptr,
cuvs::core::from_dlpack<mdspan_type>(X_tensor),
cuvs::core::from_dlpack<mdspan_type>(centroids_tensor),
raft::make_host_scalar_view<T>(&cost_temp));
d_cost.view());
raft::copy(
*res_ptr, raft::make_host_scalar_view<T>(&cost_temp), raft::make_const_mdspan(d_cost.view()));
raft::resource::sync_stream(*res_ptr);
} else {
RAFT_FAIL("X dataset must be accessible on device memory");
}
Expand Down
109 changes: 102 additions & 7 deletions cpp/include/cuvs/cluster/kmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1532,14 +1532,16 @@ void transform(raft::resources const& handle,
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights.
* [len = n_samples]
*
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2) [n_samples].
* When provided, the internal norm computation is skipped.
*/
void cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const float, int> X,
raft::device_matrix_view<const float, int> centroids,
raft::host_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int>> sample_weight = std::nullopt);
raft::device_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const float, int>> X_norm = std::nullopt);
Comment on lines 1538 to +1544

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗄️ Data Integrity & Integration | 🔴 Critical | 🏗️ Heavy lift

CRITICAL: cluster_cost is now source-breaking in the public C++ API.

Switching cost from raft::host_scalar_view to raft::device_scalar_view in cpp/include/cuvs/ removes the old call surface entirely, so existing downstream callers stop compiling with no deprecation path or migration note.

Consider keeping the old host-scalar overloads as deprecated shims for at least one release and documenting the migration. As per coding guidelines, “API changes require deprecation warnings.” As per path instructions for cpp/include/cuvs/**/*, “Breaking changes require deprecation warnings and migration guide updates.”

Also applies to: 1562-1568, 1586-1592, 1610-1616

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/include/cuvs/cluster/kmeans.hpp` around lines 1538 - 1544, `cluster_cost`
in the public kmeans API is source-breaking because `cost` was changed from a
host scalar to a device scalar, so restore compatibility by keeping the existing
host-scalar overloads as deprecated shims alongside the new
`raft::device_scalar_view` overload in `cluster_cost` and the related overloads
at the referenced locations. Update the overload set in `cuvs::cluster::kmeans`
so downstream callers can still compile, emit deprecation warnings from the old
signatures, and add the required migration/deprecation note in the public API
docs for the affected functions.

Sources: Coding guidelines, Path instructions


/**
* @brief Compute cluster cost
Expand All @@ -1554,13 +1556,17 @@ void cluster_cost(
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights.
* [len = n_samples]
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2,
* i.e. sum of squares without the sqrt) [n_samples]. When
* provided, the internal norm computation is skipped.
*/
void cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const double, int> X,
raft::device_matrix_view<const double, int> centroids,
raft::host_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int>> sample_weight = std::nullopt);
raft::device_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const double, int>> X_norm = std::nullopt);

/**
* @brief Compute (optionally weighted) cluster cost
Expand All @@ -1575,13 +1581,17 @@ void cluster_cost(
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights.
* [len = n_samples]
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2,
* i.e. sum of squares without the sqrt) [n_samples]. When
* provided, the internal norm computation is skipped.
*/
void cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const float, int64_t> X,
raft::device_matrix_view<const float, int64_t> centroids,
raft::host_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int64_t>> sample_weight = std::nullopt);
raft::device_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int64_t>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const float, int64_t>> X_norm = std::nullopt);

/**
* @brief Compute (optionally weighted) cluster cost
Expand All @@ -1596,13 +1606,98 @@ void cluster_cost(
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights.
* [len = n_samples]
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2,
* i.e. sum of squares without the sqrt) [n_samples]. When
* provided, the internal norm computation is skipped.
*/
void cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const double, int64_t> X,
raft::device_matrix_view<const double, int64_t> centroids,
raft::device_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int64_t>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const double, int64_t>> X_norm = std::nullopt);

/**
* @brief [deprecated] Compute (optionally weighted) cluster cost, writing the result to a host
* scalar.
*
* @param[in] handle The raft handle
* @param[in] X Training instances [n_samples x n_features], row-major
* @param[in] centroids Cluster centroids [n_clusters x n_features], row-major
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights [n_samples]
*/
[[deprecated(
"Pass a raft::device_scalar_view; this host-scalar overload forces a D->H copy and "
"stream sync")]] void
cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const float, int> X,
raft::device_matrix_view<const float, int> centroids,
raft::host_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int>> sample_weight = std::nullopt);

/**
* @brief [deprecated] Compute (optionally weighted) cluster cost, writing the result to a host
* scalar.
*
* @param[in] handle The raft handle
* @param[in] X Training instances [n_samples x n_features], row-major
* @param[in] centroids Cluster centroids [n_clusters x n_features], row-major
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights [n_samples]
*/
[[deprecated(
"Pass a raft::device_scalar_view; this host-scalar overload forces a D->H copy and "
"stream sync")]] void
cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const double, int> X,
raft::device_matrix_view<const double, int> centroids,
raft::host_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int>> sample_weight = std::nullopt);

/**
* @brief [deprecated] Compute (optionally weighted) cluster cost, writing the result to a host
* scalar.
*
* @param[in] handle The raft handle
* @param[in] X Training instances [n_samples x n_features], row-major
* @param[in] centroids Cluster centroids [n_clusters x n_features], row-major
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights [n_samples]
*/
[[deprecated(
"Pass a raft::device_scalar_view; this host-scalar overload forces a D->H copy and "
"stream sync")]] void
cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const float, int64_t> X,
raft::device_matrix_view<const float, int64_t> centroids,
raft::host_scalar_view<float> cost,
std::optional<raft::device_vector_view<const float, int64_t>> sample_weight = std::nullopt);

/**
* @brief [deprecated] Compute (optionally weighted) cluster cost, writing the result to a host
* scalar.
*
* @param[in] handle The raft handle
* @param[in] X Training instances [n_samples x n_features], row-major
* @param[in] centroids Cluster centroids [n_clusters x n_features], row-major
* @param[out] cost Resulting cluster cost
* @param[in] sample_weight Optional per-sample weights [n_samples]
*/
[[deprecated(
"Pass a raft::device_scalar_view; this host-scalar overload forces a D->H copy and "
"stream sync")]] void
cluster_cost(
const raft::resources& handle,
raft::device_matrix_view<const double, int64_t> X,
raft::device_matrix_view<const double, int64_t> centroids,
raft::host_scalar_view<double> cost,
std::optional<raft::device_vector_view<const double, int64_t>> sample_weight = std::nullopt);

/**
* @}
*/
Expand Down
38 changes: 30 additions & 8 deletions cpp/src/cluster/detail/kmeans.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,11 @@ void kmeans_fit(
auto centroids_const = raft::make_device_matrix_view<const DataT, IndexT>(
cur_centroids_ptr, n_clusters, n_features);

iter_inertia = DataT{0};
auto d_iter_inertia = raft::make_device_scalar<DataT>(handle, DataT{0});
auto d_batch_cost = raft::make_device_scalar<DataT>(handle, DataT{0});
DataT* p_acc = d_iter_inertia.data_handle();
DataT* p_batch = d_batch_cost.data_handle();

data_batches.reset();
using wt_iter_t = cuvs::spatial::knn::detail::utils::batch_load_iterator_dyn<DataT>;
std::optional<wt_iter_t> wt_it;
Expand All @@ -956,15 +960,33 @@ void kmeans_fit(
cur_batch_weights(static_cast<IndexT>(data_batch.offset()), wt_data, cur_batch_size);
}

DataT batch_cost = DataT{0};
cuvs::cluster::kmeans::cluster_cost(handle,
batch_data_view,
centroids_const,
raft::make_host_scalar_view(&batch_cost),
batch_sw);
std::optional<raft::device_vector_view<const DataT, IndexT>> batch_xnorm = std::nullopt;
if (need_compute_norms) {
if constexpr (data_on_device) {
batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
L2NormBatch.data_handle() + data_batch.offset(), cur_batch_size);
} else if (norms_cached) {
raft::copy(L2NormBatch.data_handle(),
h_norm_cache.data_handle() + data_batch.offset(),
cur_batch_size,
stream);
batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
L2NormBatch.data_handle(), cur_batch_size);
}
Comment on lines +963 to +975

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

HIGH: Guard the uncached host-norm path for zero-iteration fits.

Issue: For host data, this final inertia path always copies from h_norm_cache; if max_iter == 0, the training loop never populated it, so inertia is computed from uninitialized norms.
Why: This returns incorrect final inertia for a valid-looking parameter combination because max_iter is not rejected earlier.

Suggested fix
         if (need_compute_norms) {
           if constexpr (data_on_device) {
             batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
               L2NormBatch.data_handle() + data_batch.offset(), cur_batch_size);
           } else {
-            raft::copy(L2NormBatch.data_handle(),
-                       h_norm_cache.data_handle() + data_batch.offset(),
-                       cur_batch_size,
-                       stream);
+            if (norms_cached) {
+              raft::copy(L2NormBatch.data_handle(),
+                         h_norm_cache.data_handle() + data_batch.offset(),
+                         cur_batch_size,
+                         stream);
+            } else {
+              compute_batch_norms(data_batch.data(), cur_batch_size);
+            }
             batch_xnorm = raft::make_device_vector_view<const DataT, IndexT>(
               L2NormBatch.data_handle(), cur_batch_size);
           }
         }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/src/cluster/detail/kmeans.cuh` around lines 963 - 975, The host data path
(in the else block after checking data_on_device) unconditionally copies from
h_norm_cache without verifying it was populated. When max_iter equals zero, the
training loop never initializes h_norm_cache, causing the raft::copy operation
to read uninitialized memory and compute incorrect final inertia. Add a guard
condition in the else block to check if max_iter is zero, and handle this case
separately (either by skipping the norm computation or computing norms
on-the-fly) before attempting to copy from the unpopulated h_norm_cache.

}

cuvs::cluster::kmeans::cluster_cost(
handle, batch_data_view, centroids_const, d_batch_cost.view(), batch_sw, batch_xnorm);

iter_inertia += batch_cost;
raft::linalg::map_offset(handle,
raft::make_device_vector_view<DataT, int>(p_acc, 1),
[p_acc, p_batch] __device__(int) { return *p_acc + *p_batch; });
}

raft::copy(handle,
raft::make_host_scalar_view<DataT>(&iter_inertia),
raft::make_const_mdspan(d_iter_inertia.view()));
raft::resource::sync_stream(handle);
}

if (iter_inertia < inertia[0]) {
Expand Down
8 changes: 6 additions & 2 deletions cpp/src/cluster/detail/kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1139,8 +1139,12 @@ void build_hierarchical(const raft::resources& handle,
reinterpret_cast<const MathT*>(dataset), n_rows, dim);
auto centroids_view =
raft::make_device_matrix_view<const MathT, IdxT>(cluster_centers, n_clusters, dim);
cuvs::cluster::kmeans::cluster_cost(
handle, X_view, centroids_view, raft::make_host_scalar_view<MathT>(inertia));
auto d_inertia = raft::make_device_scalar<MathT>(handle, MathT{0});
cuvs::cluster::kmeans::cluster_cost(handle, X_view, centroids_view, d_inertia.view());
raft::copy(handle,
raft::make_host_scalar_view<MathT>(inertia),
raft::make_const_mdspan(d_inertia.view()));
raft::resource::sync_stream(handle, stream);
Comment on lines +1142 to +1147

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚀 Performance & Scalability | 🟠 Major | ⚡ Quick win

HIGH: final hierarchical inertia still drops the cached dataset norms.

build_hierarchical() has already materialized dataset_norm above, but this cluster_cost() call does not pass it through, so the final inertia path pays for another full row-norm allocation/recompute over n_rows. That gives back much of the win this PR is targeting on the hierarchical path.

Have you considered forwarding dataset_norm here for the L2-based cases that precompute it?

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@cpp/src/cluster/detail/kmeans_balanced.cuh` around lines 1142 - 1147, The
final inertia computation in build_hierarchical is still recomputing row norms
instead of reusing the cached dataset_norm. Update the cluster_cost call in this
hierarchical path to forward dataset_norm for the L2 cases that already
precompute it, and keep the existing fallback behavior for other metrics or
paths. Use the build_hierarchical and cluster_cost symbols to locate the call
site and thread the cached norm through the final inertia calculation.

} else {
RAFT_LOG_WARN("Inertia is not computed for non float/double types");
}
Expand Down
52 changes: 25 additions & 27 deletions cpp/src/cluster/kmeans.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -382,14 +382,17 @@ void min_cluster_distance(raft::resources const& handle,
* @param[in] centroids Cluster centroids [n_clusters x n_features]
* @param[out] cost Sum of squared distances to nearest centroid (device)
* @param[in] sample_weight Optional per-sample weights [n_samples]
* @param[in] X_norm Optional precomputed squared L2 row norms of X (||x||^2) [n_samples].
* When provided, the internal norm computation is skipped.
*/
template <typename DataT, typename IndexT>
void cluster_cost(
raft::resources const& handle,
raft::device_matrix_view<const DataT, IndexT> X,
raft::device_matrix_view<const DataT, IndexT> centroids,
raft::device_scalar_view<DataT> cost,
std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt)
std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt,
std::optional<raft::device_vector_view<const DataT, IndexT>> X_norm = std::nullopt)
{
auto stream = raft::resource::get_cuda_stream(handle);
auto n_clusters = centroids.extent(0);
Expand All @@ -398,8 +401,18 @@ void cluster_cost(

rmm::device_uvector<char> workspace(n_samples * sizeof(IndexT), stream);

auto x_norms = raft::make_device_vector<DataT>(handle, n_samples);
raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(handle, X, x_norms.view());
std::optional<raft::device_vector<DataT, IndexT>> x_norms_buf;
DataT* x_norms_ptr;
if (X_norm.has_value()) {
RAFT_EXPECTS(X_norm->extent(0) == n_samples, "X_norm size !=n_samples");
x_norms_ptr = const_cast<DataT*>(X_norm->data_handle());
} else {
x_norms_buf.emplace(raft::make_device_vector<DataT, IndexT>(handle, n_samples));
raft::linalg::norm<raft::linalg::L2Norm, raft::Apply::ALONG_ROWS>(
handle, X, x_norms_buf->view());
x_norms_ptr = x_norms_buf->data_handle();
}
auto x_norms_view = raft::make_device_vector_view<DataT, IndexT>(x_norms_ptr, n_samples);
Comment thread
coderabbitai[bot] marked this conversation as resolved.

auto min_cluster_distance = raft::make_device_vector<DataT>(handle, n_samples);
rmm::device_uvector<DataT> l2_norm_or_distance_buffer(0, stream);
Expand All @@ -412,7 +425,7 @@ void cluster_cost(
raft::make_device_matrix_view<DataT, IndexT>(
const_cast<DataT*>(centroids.data_handle()), n_clusters, n_features),
min_cluster_distance.view(),
x_norms.view(),
x_norms_view,
l2_norm_or_distance_buffer,
metric,
n_samples,
Expand All @@ -431,31 +444,16 @@ void cluster_cost(
handle, min_cluster_distance.view(), workspace, cost, raft::add_op{});
}

/**
* @brief Compute (optionally weighted) cluster cost (inertia) — host-scalar output.
*
* Convenience wrapper that copies the result to host and synchronizes.
*
* @tparam DataT float or double
* @tparam IndexT Index type
*
* @param[in] handle The raft handle
* @param[in] X Input data [n_samples x n_features]
* @param[in] centroids Cluster centroids [n_clusters x n_features]
* @param[out] cost Sum of squared distances to nearest centroid (host)
* @param[in] sample_weight Optional per-sample weights [n_samples]
*/
template <typename DataT, typename IndexT>
void cluster_cost(
raft::resources const& handle,
raft::device_matrix_view<const DataT, IndexT> X,
raft::device_matrix_view<const DataT, IndexT> centroids,
raft::host_scalar_view<DataT> cost,
std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight = std::nullopt)
void cluster_cost_host(const raft::resources& handle,
raft::device_matrix_view<const DataT, IndexT> X,
raft::device_matrix_view<const DataT, IndexT> centroids,
raft::host_scalar_view<DataT> cost,
std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight)
{
auto device_cost = raft::make_device_scalar<DataT>(handle, DataT(0));
cuvs::cluster::kmeans::cluster_cost(handle, X, centroids, device_cost.view(), sample_weight);
raft::copy(handle, cost, raft::make_const_mdspan(device_cost.view()));
auto d_cost = raft::make_device_scalar<DataT>(handle, DataT{0});
cluster_cost(handle, X, centroids, d_cost.view(), sample_weight, std::nullopt);
raft::copy(handle, cost, raft::make_const_mdspan(d_cost.view()));
raft::resource::sync_stream(handle);
}

Expand Down
Loading
Loading