Skip to content

Commit fe324d6

Browse files
mcfimeta-codesync[bot]
authored and committed
Vectorize requantize_ for Arm64 with NEON intrinsics (#5130)
Summary: Pull Request resolved: #5130 X-link: https://github.com/facebookresearch/FBGEMM/pull/2132 This change adds a vectorized requantize_ for Arm64 with NEON intrinsics: 1. The newly added NEON intrinsics follow what the existing AVX2 code does. 2. The scalar loop was moved to a new function requantize_i8dw_ref_ to make the code more readable and testable. 3. Added new tests to verify that requantize_ and requantize_i8dw_ref_ produce identical results. Differential Revision: D86216347
1 parent 1fd545d commit fe324d6

File tree

6 files changed

+769
-53
lines changed

6 files changed

+769
-53
lines changed

src/FbgemmI8Depthwise2DAvx2-inl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#pragma once
1010

11-
#include "./FbgemmI8DepthwiseAvx2-inl.h" // @manual
11+
#include "./FbgemmI8DepthwiseUtils.h" // @manual
1212
#include "./GenerateI8Depthwise.h" // @manual
1313
#include "./MaskAvx2.h" // @manual
1414
#include "fbgemm/Utils.h"

src/FbgemmI8Depthwise3DAvx2.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include <stdexcept> // for logic_error
1313
#include <string>
1414

15-
#include "./FbgemmI8DepthwiseAvx2-inl.h" // @manual
15+
#include "./FbgemmI8DepthwiseUtils.h" // @manual
1616
#include "./GenerateI8Depthwise.h" // @manual
1717
#include "./MaskAvx2.h" // @manual
1818
#include "fbgemm/Utils.h"

src/FbgemmI8DepthwiseUtils.h

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <math.h>
12+
#include <algorithm> // for min and max
13+
#include <cassert>
14+
#include <cmath> // for lrintf and sqrt
15+
#include <cstdint>
16+
#include <type_traits> // for is_same
17+
18+
#include "fbgemm/FbgemmBuild.h"
19+
#include "fbgemm/UtilsAvx2.h"
20+
21+
namespace fbgemm {
22+
23+
// Almost same as ReQuantizeOutput in OutputProcessing-inh.h but different
24+
// row_offsets for each row because of depth-wise convolution
25+
26+
template <
27+
bool FUSE_RELU,
28+
bool HAS_BIAS,
29+
QuantizationGranularity Q_GRAN,
30+
bool A_SYMMETRIC,
31+
bool B_SYMMETRIC,
32+
int K_PER_G,
33+
typename BIAS_TYPE>
34+
static ALWAYS_INLINE void requantize_i8dw_ref_(
35+
std::int32_t A_zero_point,
36+
const std::int32_t* B_zero_point,
37+
const float* C_multiplier,
38+
std::int32_t C_zero_point,
39+
const std::int32_t* C_int32,
40+
std::uint8_t* C_uint8,
41+
int n,
42+
int j, // starting index
43+
const std::int32_t* row_offsets,
44+
const std::int32_t* col_offsets,
45+
const BIAS_TYPE* bias [[maybe_unused]],
46+
const float* act_times_w_scale = nullptr) {
47+
for (; j < n; ++j) {
48+
std::int32_t raw = C_int32[j];
49+
int quant_param_idx = 0;
50+
if constexpr (
51+
Q_GRAN == QuantizationGranularity::OUT_CHANNEL ||
52+
(Q_GRAN == QuantizationGranularity::GROUP && K_PER_G == 1)) {
53+
quant_param_idx = j;
54+
} else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
55+
quant_param_idx = j / 2;
56+
}
57+
if constexpr (!B_SYMMETRIC) {
58+
raw -= B_zero_point[quant_param_idx] * row_offsets[j / K_PER_G];
59+
}
60+
if constexpr (!A_SYMMETRIC) {
61+
raw -= A_zero_point * col_offsets[j];
62+
}
63+
float raw_f = NAN;
64+
if constexpr (HAS_BIAS) { // static if
65+
if constexpr (std::is_same_v<BIAS_TYPE, float>) {
66+
raw_f = raw;
67+
raw_f += bias[j] / act_times_w_scale[quant_param_idx];
68+
} else {
69+
raw += bias[j];
70+
raw_f = raw;
71+
}
72+
} else {
73+
raw_f = raw;
74+
}
75+
76+
float ab = raw_f * C_multiplier[quant_param_idx];
77+
long rounded = lrintf(ab) + C_zero_point;
78+
79+
C_uint8[j] = std::max(
80+
FUSE_RELU ? static_cast<long>(C_zero_point) : 0l,
81+
std::min(255l, rounded));
82+
}
83+
}
84+
85+
// Splits n into a factor pair (a, n / a) with a <= n / a, choosing the
// pair whose members are as close together as possible. Assumes n >= 1.
static inline std::pair<int, int> closest_factors_(int n) {
  int small = static_cast<int>(std::sqrt(n));
  // Walk downward from floor(sqrt(n)) until a divisor is found;
  // terminates at 1, which divides everything.
  for (; n % small != 0; --small) {
  }
  return {small, n / small}; // small <= n / small
}
92+
93+
} // namespace fbgemm
94+
95+
#include "FbgemmI8DepthwiseUtilsAvx2.h"
96+
#include "FbgemmI8DepthwiseUtilsNeon.h"

src/FbgemmI8DepthwiseAvx2-inl.h renamed to src/FbgemmI8DepthwiseUtilsAvx2.h

Lines changed: 25 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,21 @@
88

99
#pragma once
1010

11-
#include <algorithm> // for min and max
11+
#if defined(__x86_64__) || defined(__i386__) || \
12+
(defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
13+
1214
#include <cassert>
1315
#include <cmath> // for lrintf and sqrt
1416
#include <cstdint>
1517
#include <type_traits> // for is_same
1618

17-
#if defined(__x86_64__) || defined(__i386__) || \
18-
(defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
1919
#include <immintrin.h>
20-
#include <math.h>
21-
#endif
2220

2321
#include "fbgemm/FbgemmBuild.h"
2422
#include "fbgemm/UtilsAvx2.h"
2523

2624
namespace fbgemm {
2725

28-
// Almost same as ReQuantizeOutput in OutputProcessing-inh.h but different
29-
// row_offsets for each row because of depth-wise convolution
3026
template <
3127
bool FUSE_RELU,
3228
bool HAS_BIAS,
@@ -503,50 +499,28 @@ static ALWAYS_INLINE void requantize_(
503499
_mm256_castsi256_si128(x_clamped_v));
504500
} // j loop vectorized
505501

506-
for (; j < n; ++j) {
507-
std::int32_t raw = C_int32[j];
508-
int quant_param_idx = 0;
509-
if constexpr (
510-
Q_GRAN == QuantizationGranularity::OUT_CHANNEL ||
511-
(Q_GRAN == QuantizationGranularity::GROUP && K_PER_G == 1)) {
512-
quant_param_idx = j;
513-
} else if constexpr (Q_GRAN == QuantizationGranularity::GROUP) {
514-
quant_param_idx = j / 2;
515-
}
516-
if constexpr (!B_SYMMETRIC) {
517-
raw -= B_zero_point[quant_param_idx] * row_offsets[j / K_PER_G];
518-
}
519-
if constexpr (!A_SYMMETRIC) {
520-
raw -= A_zero_point * col_offsets[j];
521-
}
522-
float raw_f = NAN;
523-
if constexpr (HAS_BIAS) { // static if
524-
if constexpr (std::is_same_v<BIAS_TYPE, float>) {
525-
raw_f = raw;
526-
raw_f += bias[j] / act_times_w_scale[quant_param_idx];
527-
} else {
528-
raw += bias[j];
529-
raw_f = raw;
530-
}
531-
} else {
532-
raw_f = raw;
533-
}
534-
535-
float ab = raw_f * C_multiplier[quant_param_idx];
536-
long rounded = lrintf(ab) + C_zero_point;
537-
538-
C_uint8[j] = std::max(
539-
FUSE_RELU ? static_cast<long>(C_zero_point) : 0l,
540-
std::min(255l, rounded));
541-
}
542-
}
543-
544-
static inline std::pair<int, int> closest_factors_(int n) {
545-
int a = static_cast<int>(std::sqrt(n));
546-
while (n % a != 0) {
547-
a--;
548-
}
549-
return {a, n / a}; // a <= n / a
502+
requantize_i8dw_ref_<
503+
FUSE_RELU,
504+
HAS_BIAS,
505+
Q_GRAN,
506+
A_SYMMETRIC,
507+
B_SYMMETRIC,
508+
K_PER_G,
509+
BIAS_TYPE>(
510+
A_zero_point,
511+
B_zero_point,
512+
C_multiplier,
513+
C_zero_point,
514+
C_int32,
515+
C_uint8,
516+
n,
517+
j,
518+
row_offsets,
519+
col_offsets,
520+
bias,
521+
act_times_w_scale);
550522
}
551523

552524
} // namespace fbgemm
525+
526+
#endif

0 commit comments

Comments
 (0)