Support permuting 2D weights for strided keys

kausv · facebook-github-bot · commit 605c525960a9 · 2025-11-17T19:43:49.000-08:00
Summary: X-link: facebookresearch/FBGEMM#2144 Support 2D weights in the 1D-lengths permute kernel (permute_1D_sparse_data). This kernel is invoked when strides vary per rank; see https://www.internalfb.com/code/fbsource/[5e1d1c3734c75d0664ba817ee05ce9a91a1f02e4]/fbcode/torchrec/sparse/jagged_tensor.py?lines=3111-3134 2D weights are needed for write dist, where the weights are actually embedding values. Differential Revision: D87261479
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu b/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu
@@ -40,7 +40,8 @@ __global__ __launch_bounds__(kMaxThreads) void permute_1D_data_kernel(
     const offsets_t* __restrict__ input_offsets,
     const offsets_t* __restrict__ output_offsets,
     indices_t* __restrict__ permuted_indices,
-    weights_t* __restrict__ permuted_weights) {
+    weights_t* __restrict__ permuted_weights,
+    int32_t weights_columns) {
   auto b_t_start = blockIdx.x * blockDim.y + threadIdx.y;
   const auto stride = gridDim.x * blockDim.y;
   for (int b_t = b_t_start; b_t < permuted_lengths_size; b_t += stride) {
@@ -55,7 +56,10 @@ __global__ __launch_bounds__(kMaxThreads) void permute_1D_data_kernel(
     for (auto i = threadIdx.x; i < segment_length; i += blockDim.x) {
       permuted_indices[output_start + i] = indices[input_start + i];
       if (has_weight) {
-        permuted_weights[output_start + i] = weights[input_start + i];
+        for (int col = 0; col < weights_columns; ++col) {
+          permuted_weights[(output_start + i) * weights_columns + col] =
+              weights[(input_start + i) * weights_columns + col];
+        }
       }
     }
   }
@@ -138,9 +142,14 @@ permute_1D_sparse_data_cuda(
               using indices_t = scalar_t;
               if (weights.has_value()) {
                 const Tensor weights_value = weights.value();
+                int32_t weights_columns = 1;
+                if (weights_value.dense_dim() > 1) {
+                  weights_columns = weights_value.size(1);
+                }
                 const auto weights_value_contig = weights_value.contiguous();
-                permuted_weights =
-                    at::empty(permuted_indices_size, weights_value.options());
+                permuted_weights = at::empty(
+                    {permuted_indices_size, weights_columns},
+                    weights_value.options());
                 FBGEMM_DISPATCH_ALL_TYPES_AND_DOUBLE(
                     weights_value.scalar_type(),
                     "permute_1D_data_kernel_3",
@@ -164,7 +173,8 @@ permute_1D_sparse_data_cuda(
                           input_offsets.data_ptr<offsets_t>(),
                           output_offsets.data_ptr<offsets_t>(),
                           permuted_indices.data_ptr<indices_t>(),
-                          permuted_weights.data_ptr<weights_t>());
+                          permuted_weights.data_ptr<weights_t>(),
+                          weights_columns);
                     }); // for each weights_t
               } else {
                 FBGEMM_LAUNCH_KERNEL(
@@ -185,7 +195,8 @@ permute_1D_sparse_data_cuda(
                     input_offsets.data_ptr<offsets_t>(),
                     output_offsets.data_ptr<offsets_t>(),
                     permuted_indices.data_ptr<indices_t>(),
-                    nullptr);
+                    nullptr,
+                    0);
               }
             }); // for each indices_t
       }); // for each offsets_t
diff --git a/fbgemm_gpu/test/sparse/permute_sparse_features_test.py b/fbgemm_gpu/test/sparse/permute_sparse_features_test.py
@@ -16,7 +16,7 @@
 import hypothesis.strategies as st
 import torch
 
-from hypothesis import given, settings
+from hypothesis import example, given, Phase, settings, Verbosity
 
 from .common import extend_test_class, open_source, permute_indices_ref_
 
@@ -73,7 +73,12 @@ def permute_sparse_features_ref_(
     )
     @settings(max_examples=20, deadline=None)
     def test_permute_sparse_features(
-        self, B: int, T: int, L: int, long_index: bool, has_weight: bool
+        self,
+        B: int,
+        T: int,
+        L: int,
+        long_index: bool,
+        has_weight: bool,
     ) -> None:
         index_dtype = torch.int64 if long_index else torch.int32
         lengths = torch.randint(low=1, high=L, size=(T, B)).type(index_dtype)
@@ -193,6 +198,112 @@ def test_permute_sparse_features_with_repeats(
                 assert permuted_weights_cpu is None
 
 
+class Permute1DSparseFeaturesTest(unittest.TestCase):  # exercises torch.ops.fbgemm.permute_1D_sparse_data on the current accelerator, with and without 2D weights
+    @unittest.skipIf(*gpu_unavailable)
+    @given(
+        B=st.integers(min_value=1, max_value=20),
+        T=st.integers(min_value=1, max_value=20),
+        L=st.integers(min_value=2, max_value=20),  # min 2 keeps randint(low=1, high=L) below non-empty
+        long_index=st.booleans(),
+        has_weight=st.booleans(),
+        weight_columns=st.integers(min_value=1, max_value=20),
+    )
+    @settings(
+        max_examples=20,
+        deadline=None,
+    )
+    def test_permute_1D_sparse_data(
+        self,
+        B: int,  # NOTE(review): drawn by hypothesis but not referenced in this test body
+        T: int,  # number of segments; also the length of the permutation
+        L: int,  # exclusive upper bound passed to randint for each segment length
+        long_index: bool,  # selects int64 (True) or int32 (False) for lengths and indices
+        has_weight: bool,  # when False, weights is None and the op is expected to return None weights
+        weight_columns: int,  # size of the second dimension of the weights tensor
+    ) -> None:
+        index_dtype = torch.int64 if long_index else torch.int32
+        lengths = torch.randint(
+            low=1,
+            high=L,
+            size=(T,),  # 1D: one length per segment
+            device=torch.accelerator.current_accelerator(),
+        ).type(index_dtype)
+        weights = (
+            torch.rand(
+                int(lengths.sum().item()),  # one weights row per index
+                weight_columns,
+                device=torch.accelerator.current_accelerator(),
+            ).float()
+            if has_weight
+            else None
+        )
+        indices = torch.randint(
+            low=1,
+            high=int(1e5),
+            size=cast(tuple[int, ...], (lengths.sum().item(),)),
+            device=torch.accelerator.current_accelerator(),
+        ).type(index_dtype)
+        permute_list = list(range(T))
+        random.shuffle(permute_list)  # random permutation of the T segments
+        permute = torch.IntTensor(permute_list).to(
+            device=torch.accelerator.current_accelerator()
+        )
+        (
+            lengths_actual,
+            values_actual,
+            weights_actual,
+        ) = torch.ops.fbgemm.permute_1D_sparse_data(
+            permute, lengths, indices, weights, indices.numel()  # final argument is the total index count
+        )
+
+        self.assertTrue(
+            torch.equal(
+                lengths_actual, torch.index_select(lengths, dim=0, index=permute)
+            )
+        )
+        permuted_cumulated_index = 0  # running start offset into the permuted outputs
+        cumulative_indices = torch.cumsum(  # entry k is the start offset of input segment k (leading zero prepended)
+            torch.cat(
+                (
+                    torch.zeros((1,), dtype=index_dtype, device=lengths.device),
+                    lengths,
+                )
+            ),
+            dim=0,
+        )
+
+        for i in range(T):  # walk the output segments in order
+            permuted_index = permute[i]  # input segment expected at output position i
+            self.assertTrue(
+                torch.equal(
+                    values_actual[
+                        permuted_cumulated_index : permuted_cumulated_index
+                        + lengths[permuted_index]
+                    ],
+                    indices[
+                        cumulative_indices[permuted_index] : lengths[permuted_index]
+                        + cumulative_indices[permuted_index]
+                    ],
+                )
+            )
+            if has_weight and weights is not None:
+                self.assertTrue(
+                    torch.equal(
+                        weights_actual[  # row slice; each row carries all weight columns
+                            permuted_cumulated_index : permuted_cumulated_index
+                            + lengths[permuted_index]
+                        ],
+                        weights[
+                            cumulative_indices[permuted_index] : lengths[permuted_index]
+                            + cumulative_indices[permuted_index]
+                        ],
+                    )
+                )
+            else:
+                assert weights_actual is None
+            permuted_cumulated_index += lengths[permuted_index]  # advance to the next output segment
+
+
 class Permute2DSparseFeaturesTest(unittest.TestCase):
     @unittest.skipIf(*gpu_unavailable)
     def test_permute_2D_sparse_data(self) -> None:
@@ -234,6 +345,7 @@ def test_permute_2D_sparse_data(self) -> None:
 
 
 extend_test_class(PermuteSparseFeaturesTest)
+# extend_test_class(Permute1DSparseFeaturesTest)
 
 if __name__ == "__main__":
     unittest.main()