@@ -46,6 +46,9 @@ bool try_to_optimize_copy_with_any_format(at::Tensor& self, const at::Tensor& sr
 namespace {
 
 std::vector<int64_t> inferOriginShape(at::IntArrayRef sizes, at::IntArrayRef strides) {
+    if (sizes.empty()) {
+        return std::vector<int64_t>();
+    }
     std::vector<int64_t> originSizes(sizes.size(), 1);
     originSizes[0] = sizes[0] * strides[0];
     for (size_t i = 1; i < sizes.size(); i++) {
@@ -57,6 +60,30 @@ std::vector<int64_t> inferOriginShape(at::IntArrayRef sizes, at::IntArrayRef str
     return originSizes;
 }
 
+at::Tensor viewToSameDim(const at::Tensor& tensor, const at::IntArrayRef destShape) {
+    const auto originShape = tensor.sizes();
+    std::vector<int64_t> strides(destShape.size(), 0);
+    if (originShape.size() < destShape.size()) {
+        std::vector<int64_t> sameDims;
+        for (int i = destShape.size() - 1; i >= 0; i--) {
+            for (int j = originShape.size() - 1 - sameDims.size(); j >= 0; j--) {
+                if (destShape[i] == originShape[j]) {
+                    sameDims.push_back(i);
+                    strides[i] = tensor.strides()[j];
+                    break;
+                }
+            }
+        }
+    } else if (originShape.size() == destShape.size()) {
+        for (size_t i = 0; i < destShape.size(); i++) {
+            if (destShape[i] == originShape[i]) {
+                strides[i] = tensor.stride(i);
+            }
+        }
+    }
+    return impl::aten::viewStorage(tensor, destShape, strides);
+}
+
 } // namespace
 
 bool isPartOfOther(const at::Tensor& tensor) {
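
Note on the new helper: viewToSameDim matches destination dimensions against the source sizes from the back; a matched dimension inherits the source stride, and every unmatched dimension keeps stride 0, i.e. it is broadcast. A minimal sketch, assuming the anonymous-namespace helper is visible and that impl::aten::viewStorage(tensor, sizes, strides) reinterprets the tensor's storage with the given geometry (illustrative values, not from this commit):

    at::Tensor t = at::arange(3, at::kFloat);  // sizes {3}, strides {1}
    at::Tensor b = viewToSameDim(t, {2, 3});   // sizes {2, 3}, strides {0, 1}
    // b shares t's storage; the unmatched leading dim is a stride-0
    // broadcast, so b reads as {{0, 1, 2}, {0, 1, 2}}.
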
@@ -78,16 +105,18 @@ at::Tensor& npu_view_copy(at::Tensor& self, const at::Tensor& src, bool non_bloc
     auto self_stride = self.strides();
     auto src_size = src.sizes();
     auto src_stride = src.strides();
-    auto originShape = inferOriginShape(self.sizes(), self.strides());
-    auto originSizeTensor = at_npu::native::empty_npu(originShape, self.options());
+    auto originSelfShape = inferOriginShape(self.sizes(), self.strides());
+    auto originSizeTensor = at_npu::native::empty_npu(originSelfShape, self.options());
+
+    auto originSrcShape = inferOriginShape(src.sizes(), src.strides());
 
     at_npu::native::OpCommand cmd;
     cmd.Name("ViewCopy")
-        .InputWithoutContiguous(impl::aten::viewStorage(self, originShape))
+        .InputWithoutContiguous(impl::aten::viewStorage(self, originSelfShape))
         .Input(self_size, at::kLong, at_npu::native::CompileType::MEMORY_HOST_COMPILE_INDEPENDENT)
         .Input(self_stride, at::kLong, at_npu::native::CompileType::MEMORY_HOST_COMPILE_INDEPENDENT)
         .Input(at::Scalar(0), at::kLong)
-        .InputWithoutContiguous(src)
+        .InputWithoutContiguous(impl::aten::viewStorage(src, originSrcShape))
         .Input(src_size, at::kLong, at_npu::native::CompileType::MEMORY_HOST_COMPILE_INDEPENDENT)
         .Input(src_stride, at::kLong, at_npu::native::CompileType::MEMORY_HOST_COMPILE_INDEPENDENT)
         .Input(at::Scalar(0), at::kLong)
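
The change above wraps both operands, not just self, in a view of their inferred origin storage shape before handing them to ViewCopy, so the op receives each full underlying buffer together with an explicit (size, stride, offset) description and can resolve strided views on either side itself. Condensed pattern (same names as in the hunk):

    auto originSelfShape = inferOriginShape(self.sizes(), self.strides());
    auto originSrcShape  = inferOriginShape(src.sizes(), src.strides());
    auto selfStorageView = impl::aten::viewStorage(self, originSelfShape);
    auto srcStorageView  = impl::aten::viewStorage(src, originSrcShape);
    // ViewCopy applies self_size/self_stride and src_size/src_stride on
    // top of these raw storage views.
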
@@ -102,9 +131,8 @@ at::Tensor& npu_view_copy(at::Tensor& self, const at::Tensor& src, bool non_bloc
 void copy_d2d_last_method(at::Tensor& self, const at::Tensor& src, bool same_type, bool non_blocking) {
     // general copy method but low performance
     RECORD_FUNCTION("contiguous_d_ViewCopy", std::vector<c10::IValue>({src}));
-    if (isPartOfOther(self)) {
+    if (1 || isPartOfOther(self)) {
         npu_view_copy(self, src, non_blocking);
-        // custom_ops::npu_view_copy(self, src, non_blocking);
     } else {
         custom_ops::npu_view_copy(self, src, non_blocking);
     }
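
Note that 1 || isPartOfOther(self) is always true, so the first branch is taken unconditionally and the custom_ops::npu_view_copy arm below it is dead code; the condition appears to be deliberately pinned while the local npu_view_copy path is exercised. Equivalent control flow while pinned:

    // the literal 1 short-circuits the ||, so this always runs:
    npu_view_copy(self, src, non_blocking);
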
@@ -314,16 +342,20 @@ void copy_d2d_dtype_baseformat(at::Tensor& self, const at::Tensor& src, bool non
             // Optimized trans-contiguous method
             return;
         } else {
-            // General trans-contiguous method
-            RECORD_FUNCTION("contiguous_d_AsStrided", std::vector<c10::IValue>({src}));
-#if 0
-            custom_ops::npu_stride_copy_out(src, src.sizes(), src.strides(), src.storage_offset(), self);
-#else
-            std::vector<int64_t> shape(src.sizes().size(), 1);
-            shape[0] = at::detail::computeStorageNbytes(src.sizes(), src.strides(), src.itemsize()) / src.itemsize();
-            custom_ops::npu_stride_copy_out(impl::aten::viewStorage(src, shape), src.sizes(), src.strides(), src.storage_offset(), self);
-#endif
-            return;
+            // AsStrided does not support double
+            if (src.scalar_type() != at::kDouble) {
+                // General trans-contiguous method
+                RECORD_FUNCTION("contiguous_d_AsStrided", std::vector<c10::IValue>({src}));
+                at::Tensor source = src;
+                if (self.sizes() != src.sizes()) {
+                    source = viewToSameDim(source, self.sizes());
+                }
+
+                // custom_ops::npu_stride_copy_out(src, src.sizes(), src.strides(), src.storage_offset(), self);
+                auto shape = inferOriginShape(source.sizes(), source.strides());
+                custom_ops::npu_stride_copy_out(impl::aten::viewStorage(source, shape), source.sizes(), source.strides(), source.storage_offset(), self);
+                return;
+            }
         }
     } else {
         // Contiguous source tensor copy to contiguous self tensor
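
Two behavioral changes land in this hunk: at::kDouble sources now skip the AsStrided route entirely (the kernel lacks double support) and fall through to the general copy path later in the function, and shape-mismatched (broadcast) sources are first expanded to self's shape via viewToSameDim so the sizes and strides handed to npu_stride_copy_out already describe the broadcast. A sketch of the broadcast case (illustrative values):

    // copying src with sizes {3} into self with sizes {2, 3}:
    at::Tensor source = viewToSameDim(src, self.sizes());  // sizes {2, 3}, strides {0, 1}
    auto shape = inferOriginShape(source.sizes(), source.strides());
    // npu_stride_copy_out receives the storage view plus the broadcast
    // geometry, so src is never materialized at the expanded shape.
    custom_ops::npu_stride_copy_out(impl::aten::viewStorage(source, shape),
                                    source.sizes(), source.strides(),
                                    source.storage_offset(), self);
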
@@ -459,7 +491,7 @@ class BroadcastContiguousOpt : public ContiguousOpt {
     }
 }; // class BroadcastContiguousOpt
 
-REGISTER_COPY_OPT(broadcast, BroadcastContiguousOpt)
+// REGISTER_COPY_OPT(broadcast, BroadcastContiguousOpt)
 
 constexpr int MaxCombinedCasesNum = 2;
 constexpr int ViewAndBaseInfoStackNum = 2;
@@ -831,7 +863,7 @@ class CombinedContiguousOpt : public ContiguousOpt {
     }
 }; // class CombinedContiguousOpt
 
-REGISTER_COPY_OPT(combined, CombinedContiguousOpt)
+// REGISTER_COPY_OPT(combined, CombinedContiguousOpt)
 
 class IndexingContiguousOpt : public ContiguousOpt {
 public:
@@ -945,7 +977,7 @@ class IndexingContiguousOpt : public ContiguousOpt {
     }
 }; // class IndexingContiguousOpt
 
-REGISTER_COPY_OPT(indexing, IndexingContiguousOpt)
+// REGISTER_COPY_OPT(indexing, IndexingContiguousOpt)
 
 class PermuteContiguousOpt : public ContiguousOpt {
 public:
@@ -1106,7 +1138,7 @@ class PermuteContiguousOpt : public ContiguousOpt {
     }
 }; // class PermuteContiguousOpt
 
-REGISTER_COPY_OPT(permute, PermuteContiguousOpt)
+// REGISTER_COPY_OPT(permute, PermuteContiguousOpt)
 
 bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc& tensor_desc) {
     int64_t tensor_shape_size = static_cast<int64_t>(tensor_desc.sizes_.size());
@@ -1193,7 +1225,7 @@ class ReshapeContiguousOpt : public ContiguousOpt {
     bool CanOptimizer(const ContiguousTensorDesc& src_desc) override { return check_reshape_match(src_desc); }
 }; // class ReshapeContiguousOpt
 
-REGISTER_COPY_OPT(reshape, ReshapeContiguousOpt)
+// REGISTER_COPY_OPT(reshape, ReshapeContiguousOpt)
 
 class ReshapeV2ContiguousOpt : public ContiguousOpt {
 public:
@@ -1269,7 +1301,7 @@ class ReshapeV2ContiguousOpt : public ContiguousOpt {
     }
 }; // class ReshapeV2ContiguousOpt
 
-REGISTER_COPY_OPT(reshapeV2, ReshapeV2ContiguousOpt)
+// REGISTER_COPY_OPT(reshapeV2, ReshapeV2ContiguousOpt)
 
 class SelectContiguousOpt : public ContiguousOpt {
 public:
@@ -1381,7 +1413,7 @@ class SelectContiguousOpt : public ContiguousOpt {
     }
 }; // class SelectContiguousOpt
 
-REGISTER_COPY_OPT(select, SelectContiguousOpt)
+// REGISTER_COPY_OPT(select, SelectContiguousOpt)
 
 class SliceContiguousOpt : public ContiguousOpt {
 public:
@@ -1489,7 +1521,7 @@ class SliceContiguousOpt : public ContiguousOpt {
     }
 }; // class SliceContiguousOpt
 
-REGISTER_COPY_OPT(slice, SliceContiguousOpt)
+// REGISTER_COPY_OPT(slice, SliceContiguousOpt)
 
 } // namespace native
 } // namespace at_npu
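
Taken together, the commented-out registrations disable the broadcast, combined, indexing, permute, reshape, reshapeV2, select, and slice trans-contiguous fast paths, so tensors matching those patterns now fall back to the general ViewCopy/AsStrided copies patched above. A hypothetical sketch of what such a registration macro typically expands to (not this repo's exact definition; CopyOptRegister is an assumed name):

    // hypothetical: each call statically registers an optimizer instance
    // under a pattern name that the copy dispatcher looks up at runtime.
    #define REGISTER_COPY_OPT(name, optimizer) \
        static CopyOptRegister g_##name##_copy_opt(#name, std::make_shared<optimizer>());
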