
Commit b2569b4

zjing14 authored and meta-codesync[bot] committed
Update fbgemm fp8 conv heuristic (#5118)
Summary:
Pull Request resolved: #5118
X-link: https://github.com/facebookresearch/FBGEMM/pull/2124

- Update fp8 conv heuristic for D86440061

Reviewed By: jwfromm
Differential Revision: D86558446
fbshipit-source-id: 6b8b4fff190b2f98181ac018576bdbbbe940d256
1 parent bfa83ec · commit b2569b4

File tree: 5 files changed, +172 −32 lines


fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_conv.cu

Lines changed: 52 additions & 32 deletions
@@ -38,8 +38,24 @@ struct ProblemSize {
   std::vector<int64_t> dilation;
   bool operator==(const ProblemSize& ps) const {
     return activation_shape == ps.activation_shape &&
-        filter_shape == ps.filter_shape && padding == ps.padding &&
-        stride == ps.stride && dilation == ps.dilation;
+        filter_shape == ps.filter_shape;
+  }
+  void print() const {
+    // clang-format off
+    std::cout << "actv: " // [N, D, H, W, C]
+              << activation_shape[0] << ","
+              << activation_shape[1] << ","
+              << activation_shape[2] << ","
+              << activation_shape[3] << ","
+              << activation_shape[4] << ","
+              << "filter: " // [K, T, R, S, C]
+              << filter_shape[0] << ","
+              << filter_shape[1] << ","
+              << filter_shape[2] << ","
+              << filter_shape[3] << ","
+              << filter_shape[4] << ","
+              << std::endl;
+    // clang-format on
   }
 };

@@ -59,42 +75,43 @@ struct ProblemSizeHash {
     };
     hash_combine(seed, vec_hash(ps.activation_shape));
     hash_combine(seed, vec_hash(ps.filter_shape));
-    hash_combine(seed, vec_hash(ps.padding));
-    hash_combine(seed, vec_hash(ps.stride));
-    hash_combine(seed, vec_hash(ps.dilation));
+    // hash_combine(seed, vec_hash(ps.padding));
+    // hash_combine(seed, vec_hash(ps.stride));
+    // hash_combine(seed, vec_hash(ps.dilation));
     return seed;
   }
 };

 // clang-format off
 std::unordered_map<ProblemSize, Kernel_f8f8bf16_conv, ProblemSizeHash> kernel_map = {
-    {{{1,6,32,48,48}, {48,1,1,1,48}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x128x128_1x1x1},
-    {{{1,3,34,50,48}, {1024,3,3,3,48}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_4x1x1},
-    {{{1,3,34,50,1024}, {1024,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_2x1x1},
-    {{{1,3,66,98,1024}, {1024,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_4x1x1},
-    {{{1,3,130,194,1024}, {512,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
-    {{{1,3,130,194,512}, {512,3,3,3,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x512x128_2x2x1},
-    {{{1,1,128,192,1024}, {512,1,1,1,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_2x1x1},
-    {{{1,3,258,386,512}, {256,3,3,3,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
-    {{{1,3,258,386,256}, {256,3,3,3,256}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{1,1,256,384,512}, {256,1,1,1,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    //{{{1,3,258,386,256}, {12,3,3,3,256}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_invalid},
-    {{{1,3,32,48,1024}, {2048,3,1,1,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_2x2x1},
-    {{{1,4,66,98,1024}, {1024,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x512x128_2x2x1},
-    {{{1,4,64,96,1024}, {2048,3,1,1,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{1,6,130,194,1024}, {512,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{1,6,130,194,512}, {512,3,3,3,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{1,4,128,192,1024}, {512,1,1,1,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_2x1x1},
-    {{{1,6,258,386,512}, {256,3,3,3,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{1,6,258,386,256}, {256,3,3,3,256}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{1,4,256,384,512}, {256,1,1,1,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
-    //{{{1,6,258,386,256}, {12,3,3,3,256}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_invalid},
-    {{{1,1,64,96,1024}, {1024,1,3,3,1024}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{1,1,128,192,1024}, {1024,1,3,3,1024}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{1,1,256,384,512}, {512,1,3,3,512}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{2,1,64,96,1024}, {1024,1,3,3,1024}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{4,1,128,192,1024}, {1024,1,3,3,1024}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
-    {{{4,1,256,384,512}, {512,1,3,3,512}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1}
+    {{{1,1,192,128,1024}, {512,1,1,1,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_2x1x1},
+    {{{1,1,192,128,160}, {320,1,1,1,160}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
+    {{{1,1,384,256,512}, {256,1,1,1,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,1,96,64,320}, {640,1,1,1,320}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_1x2x1},
+    {{{1,3,194,130,1024}, {512,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,3,194,130,160}, {320,3,3,3,160}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,3,194,130,320}, {320,3,3,3,320}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x128x128_4x1x1},
+    {{{1,3,194,130,512}, {512,3,3,3,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,3,386,258,160}, {160,3,3,3,160}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,3,386,258,256}, {256,3,3,3,256}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
+    {{{1,3,386,258,512}, {256,3,3,3,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,3,48,32,1024}, {2048,3,1,1,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x128x128_4x1x1},
+    {{{1,3,50,34,1024}, {1024,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_2x1x1},
+    {{{1,3,50,34,48}, {1024,3,3,3,48}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_4x1x1},
+    {{{1,3,50,34,640}, {640,3,3,3,640}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x128x128_4x1x1},
+    {{{1,3,50,34,640}, {96,3,3,3,640}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_4x2x1},
+    {{{1,3,98,66,1024}, {1024,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_4x1x1},
+    {{{1,3,98,66,1024}, {1024,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_4x1x1},
+    {{{1,3,98,66,320}, {640,3,3,3,320}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
+    {{{1,3,98,66,640}, {640,3,3,3,640}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_2x1x1},
+    {{{1,4,192,128,1024}, {512,1,1,1,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_128x256x128_2x1x1},
+    {{{1,4,384,256,512}, {256,1,1,1,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,4,96,64,1024}, {2048,3,1,1,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,4,98,66,1024}, {1024,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_512x256x128_4x1x1},
+    {{{1,6,194,130,1024}, {512,3,3,3,1024}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
+    {{{1,6,194,130,512}, {512,3,3,3,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
+    {{{1,6,386,258,256}, {256,3,3,3,256}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
+    {{{1,6,386,258,512}, {256,3,3,3,512}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, f8f8bf16_conv_256x256x128_2x1x1},
 };
 // clang-format on

@@ -114,6 +131,9 @@ Kernel_f8f8bf16_conv get_kernel_via_heuristic(
   auto it = kernel_map.find(ps);
   if (it != kernel_map.end()) {
     return it->second;
+  } else {
+    std::cout << "warning: not found";
+    ps.print();
   }
   // Fallback kernel
   return f8f8bf16_conv_256x256x128_2x1x1;
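Note on the hunks above: the heuristic's cache key is now the activation and filter shapes only; padding, stride, and dilation were dropped from both operator== and the hash (the commented-out hash_combine calls keep the excluded fields visible for easy reinstatement), and a miss is now logged via the new print() before falling back. A minimal, self-contained sketch of the resulting lookup behavior; the Kernel alias, the 0x9e3779b9 combiner constant, and main() are illustrative assumptions, not FBGEMM's exact code:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for Kernel_f8f8bf16_conv (a function pointer in the real code).
using Kernel = std::string;

struct ProblemKey {
  std::vector<int64_t> activation_shape; // [N, D, H, W, C]
  std::vector<int64_t> filter_shape; // [K, T, R, S, C]
  bool operator==(const ProblemKey& o) const {
    // Padding/stride/dilation intentionally ignored, as in the diff.
    return activation_shape == o.activation_shape &&
        filter_shape == o.filter_shape;
  }
};

struct ProblemKeyHash {
  static void hash_combine(std::size_t& seed, std::size_t h) {
    // Boost-style combiner, matching the hash_combine idiom in the diff.
    seed ^= h + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  }
  std::size_t operator()(const ProblemKey& k) const {
    auto vec_hash = [](const std::vector<int64_t>& v) {
      std::size_t h = 0;
      for (int64_t x : v) {
        hash_combine(h, std::hash<int64_t>{}(x));
      }
      return h;
    };
    std::size_t seed = 0;
    hash_combine(seed, vec_hash(k.activation_shape));
    hash_combine(seed, vec_hash(k.filter_shape));
    return seed;
  }
};

int main() {
  std::unordered_map<ProblemKey, Kernel, ProblemKeyHash> kernel_map = {
      {{{1, 1, 192, 128, 1024}, {512, 1, 1, 1, 1024}},
       "f8f8bf16_conv_128x256x128_2x1x1"},
  };
  // Any padding/stride/dilation combination now maps to the same entry.
  ProblemKey ps{{1, 1, 192, 128, 1024}, {512, 1, 1, 1, 1024}};
  auto it = kernel_map.find(ps);
  if (it != kernel_map.end()) {
    std::cout << it->second << std::endl;
  } else {
    std::cout << "warning: not found, using fallback" << std::endl;
  }
  return 0;
}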
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include "f8f8bf16_conv_common.cuh"
10+
11+
namespace fbgemm_gpu {
12+
13+
at::Tensor f8f8bf16_conv_128x256x128_1x2x1(
14+
at::Tensor activation, // FP8 - NDHWC layout
15+
at::Tensor filter, // FP8 - KTRSC layout
16+
at::Tensor scale,
17+
std::vector<int64_t> padding, // [pad_d, pad_h, pad_w]
18+
std::vector<int64_t> stride, // [stride_d, stride_h, stride_w]
19+
std::vector<int64_t> dilation) { // [dilation_d, dilation_h, dilation_w]
20+
21+
return f8f8bf16_conv_impl<
22+
128,
23+
128,
24+
128,
25+
1,
26+
2,
27+
1,
28+
cutlass::conv::KernelImplicitTmaWarpSpecialized1SmSm100>(
29+
activation, filter, scale, padding, stride, dilation);
30+
}
31+
32+
} // namespace fbgemm_gpu
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include "f8f8bf16_conv_common.cuh"
10+
11+
namespace fbgemm_gpu {
12+
13+
at::Tensor f8f8bf16_conv_256x128x128_4x1x1(
14+
at::Tensor activation, // FP8 - NDHWC layout
15+
at::Tensor filter, // FP8 - KTRSC layout
16+
at::Tensor scale,
17+
std::vector<int64_t> padding, // [pad_d, pad_h, pad_w]
18+
std::vector<int64_t> stride, // [stride_d, stride_h, stride_w]
19+
std::vector<int64_t> dilation) { // [dilation_d, dilation_h, dilation_w]
20+
21+
return f8f8bf16_conv_impl<
22+
128,
23+
128,
24+
128,
25+
4,
26+
1,
27+
1,
28+
cutlass::conv::KernelImplicitTmaWarpSpecialized2SmSm100>(
29+
activation, filter, scale, padding, stride, dilation);
30+
}
31+
32+
} // namespace fbgemm_gpu
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include "f8f8bf16_conv_common.cuh"
10+
11+
namespace fbgemm_gpu {
12+
13+
at::Tensor f8f8bf16_conv_256x256x128_4x2x1(
14+
at::Tensor activation, // FP8 - NDHWC layout
15+
at::Tensor filter, // FP8 - KTRSC layout
16+
at::Tensor scale,
17+
std::vector<int64_t> padding, // [pad_d, pad_h, pad_w]
18+
std::vector<int64_t> stride, // [stride_d, stride_h, stride_w]
19+
std::vector<int64_t> dilation) { // [dilation_d, dilation_h, dilation_w]
20+
21+
return f8f8bf16_conv_impl<
22+
128,
23+
128,
24+
128,
25+
4,
26+
2,
27+
1,
28+
cutlass::conv::KernelImplicitTmaWarpSpecialized2SmSm100>(
29+
activation, filter, scale, padding, stride, dilation);
30+
}
31+
32+
} // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_conv/f8f8bf16_conv_manifest.cuh

Lines changed: 24 additions & 0 deletions
@@ -26,6 +26,22 @@ at::Tensor f8f8bf16_conv_128x128x128_1x1x1(
     std::vector<int64_t> stride, // [stride_d, stride_h, stride_w]
     std::vector<int64_t> dilation);

+at::Tensor f8f8bf16_conv_128x256x128_1x2x1(
+    at::Tensor activation, // FP8 - NDHWC layout
+    at::Tensor filter, // FP8 - KTRSC layout
+    at::Tensor scale,
+    std::vector<int64_t> padding, // [pad_d, pad_h, pad_w]
+    std::vector<int64_t> stride, // [stride_d, stride_h, stride_w]
+    std::vector<int64_t> dilation);
+
+at::Tensor f8f8bf16_conv_256x128x128_4x1x1(
+    at::Tensor activation, // FP8 - NDHWC layout
+    at::Tensor filter, // FP8 - KTRSC layout
+    at::Tensor scale,
+    std::vector<int64_t> padding, // [pad_d, pad_h, pad_w]
+    std::vector<int64_t> stride, // [stride_d, stride_h, stride_w]
+    std::vector<int64_t> dilation);
+
 at::Tensor f8f8bf16_conv_128x256x128_2x1x1(
     at::Tensor activation, // FP8 - NDHWC layout
     at::Tensor filter, // FP8 - KTRSC layout

@@ -58,6 +74,14 @@ at::Tensor f8f8bf16_conv_256x256x128_4x1x1(
     std::vector<int64_t> stride, // [stride_d, stride_h, stride_w]
     std::vector<int64_t> dilation);

+at::Tensor f8f8bf16_conv_256x256x128_4x2x1(
+    at::Tensor activation, // FP8 - NDHWC layout
+    at::Tensor filter, // FP8 - KTRSC layout
+    at::Tensor scale,
+    std::vector<int64_t> padding, // [pad_d, pad_h, pad_w]
+    std::vector<int64_t> stride, // [stride_d, stride_h, stride_w]
+    std::vector<int64_t> dilation);
+
 at::Tensor f8f8bf16_conv_256x512x128_2x2x1(
     at::Tensor activation, // FP8 - NDHWC layout
     at::Tensor filter, // FP8 - KTRSC layout
