fix(kernel): 稍微调整 MatMulInteger 逻辑

YdrMaster · YdrMaster · commit 416cd2ef6437 · 2023-12-25T10:24:57.000+08:00
Signed-off-by: YdrMaster <ydrml@hotmail.com>
diff --git a/scripts/compare/compare.py b/scripts/compare/compare.py
@@ -23,6 +23,7 @@ def parse_args():
         args.actual,
     )
 
+
 def getDiff(base, test):
     absolute_diff = np.subtract(base, test)
     max_absolute_diff = np.max(np.abs(absolute_diff))
@@ -35,16 +36,19 @@ def getDiff(base, test):
 
     return max_absolute_diff, max_relative_diff
 
-def compare_npy(actual_path, expect_path, edge, node):
+
+def compare_npy(node, actual_path, expect_path):
     actual = np.load(actual_path)
     expect = np.load(expect_path)
     if np.isnan(actual).any():
-        print(f"NAN value in node:{node} edge:{edge}")
+        print(f"NAN value in node:{node}\t{actual_path}\t{expect_path}")
         return
-    
+
     max_absolute_diff, max_relative_diff = getDiff(expect, actual)
-    if max_absolute_diff != 0.0: ## No need to print tensor with no diff
-        print(f'{max_absolute_diff}\t{max_relative_diff}\t{node}\t{edge}')
+    if max_absolute_diff != 0.0:  ## No need to print tensor with no diff
+        print(
+            f"{max_absolute_diff}\t{max_relative_diff}\t{node}\t{actual_path}\t{expect_path}"
+        )
 
 
 def main():
@@ -70,9 +74,7 @@ def main():
                     expect_file = expect_file + ".npy"
                     expect_file_path = os.path.join(expect_dir, expect_file)
                     if os.path.exists(expect_file_path):
-                        compare_npy(
-                            actual_file_path, expect_file_path, edge_name, node_name
-                        )
+                        compare_npy(meta_file, actual_file_path, expect_file_path)
 
 
 if __name__ == "__main__":
diff --git a/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.cu b/src/04kernel/src/kernels/mat_mul_integer/cublas_kernel.cu
@@ -10,7 +10,10 @@ namespace refactor::kernel {
 
     template<class T> __device__ __forceinline__ static int8_t sub(T, T);
     template<> __device__ __forceinline__ int8_t sub<int8_t>(int8_t a, int8_t b) { return a - b; }
-    template<> __device__ __forceinline__ int8_t sub<uint8_t>(uint8_t a, uint8_t b) { return static_cast<int8_t>(static_cast<int16_t>(a) - static_cast<int16_t>(b)); }
+    template<> __device__ __forceinline__ int8_t sub<uint8_t>(uint8_t a, uint8_t b) {
+        constexpr static int16_t MAX = 127;
+        return static_cast<int8_t>(CUB_MIN(MAX, static_cast<int16_t>(a) - static_cast<int16_t>(b)));
+    }
 
     template<class T>
     struct MatMulIntegerZPFunctorScalar {
@@ -33,16 +36,16 @@ namespace refactor::kernel {
     }
 
     template<class T>
-    struct MatMulIntegerZPFunctorA {
-        dim_t m, n;
+    struct MatMulIntegerZPFunctor {
+        dim_t m, n, a, b, c;
         T const *src, *zp;
 
         __device__ int8_t operator()(size_t idx) const noexcept {
             auto
-                // k = idx % n,
+                k = idx % n,
                 j = idx / n % m,
                 i = idx / n / m;
-            return sub(src[idx], zp[i * m + j]);
+            return sub(src[idx], zp[i * a + j * b + k * c]);
         }
     };
 
@@ -52,38 +55,30 @@ namespace refactor::kernel {
         int8_t *dst, void const *src_, void const *zp_) {
         thrust::tabulate(thrust::device,
                          dst, dst + b * m * n,
-                         MatMulIntegerZPFunctorA<T>{
+                         MatMulIntegerZPFunctor<T>{
                              m,
                              n,
+                             m,
+                             1,
+                             0,
                              reinterpret_cast<T const *>(src_),
                              reinterpret_cast<T const *>(zp_),
                          });
     }
 
-    template<class T>
-    struct MatMulIntegerZPFunctorB {
-        dim_t m, n;
-        T const *src, *zp;
-
-        __device__ int8_t operator()(size_t idx) const noexcept {
-            auto
-                k = idx % n,
-                // j = idx / n % m,
-                i = idx / n / m;
-            return sub(src[idx], zp[i * n + k]);
-        }
-    };
-
     template<class T>
     static void applyZeroPointB(
         dim_t b, dim_t m, dim_t n,
         int8_t *dst, void const *src_, void const *zp_) {
 
         thrust::tabulate(thrust::device,
                          dst, dst + b * m * n,
-                         MatMulIntegerZPFunctorB<T>{
+                         MatMulIntegerZPFunctor<T>{
                              m,
                              n,
+                             n,
+                             0,
+                             1,
                              reinterpret_cast<T const *>(src_),
                              reinterpret_cast<T const *>(zp_),
                          });