InfiniTensor
diff --git a/‎operatorspy/tests/rotary_embedding.py‎
Lines changed: 25 additions & 27 deletions b/‎operatorspy/tests/rotary_embedding.py‎
Lines changed: 25 additions & 27 deletions
diff --git a/‎src/devices/bang/handle_pool.cc‎
Lines changed: 0 additions & 23 deletions b/‎src/devices/bang/handle_pool.cc‎
Lines changed: 0 additions & 23 deletions
diff --git a/‎src/devices/bang/handle_pool.h‎
Lines changed: 0 additions & 23 deletions b/‎src/devices/bang/handle_pool.h‎
Lines changed: 0 additions & 23 deletions
diff --git a/‎src/ops/matmul/bang/matmul_cnnl.cc‎
Lines changed: 1 addition & 1 deletion b/‎src/ops/matmul/bang/matmul_cnnl.cc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/ops/rotary_embedding/bang/rotary_embedding_bang.cc‎
Lines changed: 74 additions & 0 deletions b/‎src/ops/rotary_embedding/bang/rotary_embedding_bang.cc‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎src/ops/rotary_embedding/bang/rotary_embedding_bang.h‎
Lines changed: 44 additions & 0 deletions b/‎src/ops/rotary_embedding/bang/rotary_embedding_bang.h‎
Lines changed: 44 additions & 0 deletions
@@ -45,7 +45,7 @@ def rotary_embedding(t, pos, theta, torch_device):
     )
     freqs = torch.outer(pos, freqs)
     freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
-
+    
     t_ = torch.view_as_complex(t.reshape(*t.shape[:-1], -1, 2))
     freqs_cis = reshape_for_broadcast(freqs_cis, t_)
     t_out = torch.view_as_real(t_ * freqs_cis).flatten(2).to(t.dtype)
@@ -69,19 +69,31 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
     print(
         f"Testing Rotary Positional Embedding on {torch_device} with shape:{shape} strides:{strides} and dtype:{dtype}"
     )
-    t = torch.rand(shape, dtype=dtype, device=torch.device(torch_device))
+    
+    t = torch.rand(shape, dtype=dtype)
     if strides is not None:
         t = rearrange_tensor(t, strides)
-    pos = torch.arange(0, t.shape[0], device=torch.device(torch_device))
+    pos = torch.arange(0, t.shape[0])
     theta = 1e4
-    ans = rotary_embedding(t, pos, theta, torch_device)
-    pos = pos.to(torch.int64) # use int64 to support older versions of PyTorch
+    
+    if(torch_device == 'mlu'):
+        ans = rotary_embedding(t, pos, theta, "cpu").to(torch_device)
+        pos = pos.to(torch.int64)
+        pos = pos.to(torch_device)
+        t = t.to(torch_device)
+    else:
+        t = t.to(torch_device)
+        pos = pos.to(torch_device)
+        ans = rotary_embedding(t, pos, theta, torch_device)
+        pos = pos.to(torch.uint64)
+    
     descriptor = infiniopRoPEDescriptor_t()
     # 2x table length for test
     sin_table, cos_table = sin_cos_table(t.shape[0] * 2, t.shape[2], t.device, theta)
     t_tensor = to_tensor(t, lib)
     pos_tensor = to_tensor(pos, lib)
-    pos_tensor.descriptor.contents.dt = U64  # treat int64 as uint64
+    if(torch_device == 'mlu'):
+        pos_tensor.descriptor.contents.dt = U64
     sin_table_tensor = to_tensor(sin_table, lib)
     cos_table_tensor = to_tensor(cos_table, lib)
     check_error(
@@ -111,7 +123,7 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
             None,
         )
     )
-
+    
     assert torch.allclose(t, ans, atol=1e-4, rtol=1e-2)
     check_error(lib.infiniopDestroyRoPEDescriptor(descriptor))
     print("Test passed!")
@@ -135,32 +147,18 @@ def test_cuda(lib, test_cases):
 
 def test_bang(lib, test_cases):
     import torch_mlu
-
     device = DeviceEnum.DEVICE_BANG
-    config = None
-    descriptor = lib.createRotaryEmbeddingDescriptor(device, config)
-
-    # Note: BANG does not support complex calculation, compare with cpu results
-    t = torch.rand((1, 32, 128), dtype=torch.float16)
-    pos = torch.ones((1,), dtype=torch.int32)
-    theta = 1e4
-    ans = rotary_embedding(t, pos, theta, "cpu")
-
-    t = t.to("mlu")
-    pos = pos.to("mlu")
-    lib.rotaryEmbedding(
-        descriptor, to_tensor(t, lib), to_tensor(pos, lib), c_float(theta), None
-    )
-    assert torch.allclose(t.cpu(), ans, atol=1e-3, rtol=1e-3)
-    print("Test passed!")
-
-    lib.destroyRotaryEmbeddingDescriptor(descriptor)
+    handle = create_handle(lib, device)  # one handle is shared by every case below
+    for shape, strides, dtype in test_cases:
+        test(lib, handle, "mlu", shape, strides, dtype)  # with "mlu", test() computes the reference result on CPU
+    destroy_handle(lib, handle)  # not reached if a case raises; the handle is then left to process exit
 
 
 if __name__ == "__main__":
     test_cases = [
-        ((1, 32, 128), None, torch.float16),
         ((4, 1, 32), None, torch.float16),
+        ((1, 32, 128), None, torch.float16),
+        
         ((3, 32, 128), (8000, 200, 1), torch.float16),
     ]
     args = get_args()
 
@@ -1,6 +1,6 @@
 #include "matmul_cnnl.h"
+#include "../../../devices/bang/bang_handle.h"
 #include "../../../devices/bang/common_bang.h"
-#include "../../../devices/bang/handle_pool.h"
 #include "../../utils.h"
 #include "cnrt.h"
 infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
 
@@ -0,0 +1,74 @@
+#include "rotary_embedding_bang.h"
+#include "../../utils.h"
+
+// Validate the RoPE tensor descriptors and, on success, store a new RoPEBangDescriptor in *desc_ptr.
+infiniopStatus_t bangCreateRoPEDescriptor(BangHandle_t handle,
+                                          RoPEBangDescriptor_t *desc_ptr,
+                                          infiniopTensorDescriptor_t t,
+                                          infiniopTensorDescriptor_t pos_ids,
+                                          infiniopTensorDescriptor_t sin_table,
+                                          infiniopTensorDescriptor_t cos_table) {
+    // NOTE(review): handle and the four tensor descriptors are dereferenced below without null checks.
+    if (desc_ptr == nullptr)// nowhere to store the new descriptor
+        return STATUS_MEMORY_NOT_ALLOCATED;
+
+    if (t->ndim != 3 ||// t is read below as shape[0..2] = seq_len, nhead, dim
+        pos_ids->ndim != 1 ||
+        sin_table->ndim != 2 ||
+        cos_table->ndim != 2)
+        return STATUS_BAD_TENSOR_SHAPE;
+
+    auto seq_len = t->shape[0];
+    auto nhead = t->shape[1];
+    auto dim = t->shape[2];
+    auto total_seq_len = sin_table->shape[0];// row count of the sin table (checked equal to the cos table below)
+
+    if (dim % 2 != 0)// odd dim is rejected; presumably elements are rotated in pairs - confirm against the kernel
+        return STATUS_BAD_TENSOR_SHAPE;
+
+    if (pos_ids->shape[0] != seq_len ||// one position id per row of t
+        sin_table->shape[1] != dim ||// table width must equal dim
+        cos_table->shape[1] != dim ||
+        sin_table->shape[0] != cos_table->shape[0])// both tables must have the same number of rows
+        return STATUS_BAD_TENSOR_SHAPE;
+
+    if (t->strides[2] != 1 ||// the innermost dimension of every input must have unit stride
+        pos_ids->strides[0] != 1 ||
+        sin_table->strides[1] != 1 ||
+        cos_table->strides[1] != 1)
+        return STATUS_BAD_TENSOR_STRIDES;
+
+    if (!dtype_eq(t->dt, F16))// only F16 data is accepted
+        return STATUS_BAD_TENSOR_DTYPE;
+
+    if (!dtype_eq(sin_table->dt, F32) || !dtype_eq(cos_table->dt, F32))// both tables must be F32
+        return STATUS_BAD_TENSOR_DTYPE;
+
+    if (!dtype_eq(pos_ids->dt, U64))// position ids must be U64
+        return STATUS_BAD_TENSOR_DTYPE;
+    int stride_0 = static_cast<int>(t->strides[0]);// NOTE(review): narrowed to int; a stride outside int range would not survive the cast - confirm
+    int stride_1 = static_cast<int>(t->strides[1]);
+    *desc_ptr = new RoPEBangDescriptor{// initializer order follows the struct's field order; freed by bangDestroyRoPEDescriptor
+        handle->device,
+        handle->device_id,
+        t->dt,
+        seq_len,
+        nhead,
+        dim,
+        total_seq_len,
+        stride_0, stride_1};
+
+    return STATUS_SUCCESS;
+}
+
+// Report the workspace size for BANG RoPE: always writes 0 to *size and returns STATUS_SUCCESS.
+infiniopStatus_t bangGetRoPEWorkspaceSize(RoPEBangDescriptor_t desc, uint64_t *size) {// desc is not read
+    *size = 0;// NOTE(review): size is dereferenced without a null check
+    return STATUS_SUCCESS;
+}
+
+// Free a descriptor that bangCreateRoPEDescriptor allocated with new; always returns STATUS_SUCCESS.
+infiniopStatus_t bangDestroyRoPEDescriptor(RoPEBangDescriptor_t desc) {
+    delete desc;// deleting a null pointer is a no-op in C++
+    return STATUS_SUCCESS;
+}
@@ -0,0 +1,44 @@
+#ifndef __BANG_ROTARY_EMBEDDING_H__
+#define __BANG_ROTARY_EMBEDDING_H__
+
+#include "../../../devices/bang/bang_handle.h"
+#include "../../utils.h"
+#include "operators.h"
+// Values captured by bangCreateRoPEDescriptor from the handle and the tensor descriptors.
+struct RoPEBangDescriptor {
+    Device device;// copied from handle->device
+    int device_id;// copied from handle->device_id
+    DT dtype;// t->dt; creation only accepts F16
+    uint64_t seq_len;// t->shape[0]
+    uint64_t nhead;// t->shape[1]
+    uint64_t dim;// t->shape[2]; creation rejects odd values
+    uint64_t total_seq_len;// sin_table->shape[0], which creation checks equals cos_table->shape[0]
+    int stride_0;// t->strides[0] narrowed to int
+    int stride_1;// t->strides[1] narrowed to int
+};
+
+// Descriptors are passed to the functions below by pointer through this alias.
+typedef struct RoPEBangDescriptor *RoPEBangDescriptor_t;
+// Validate the tensor descriptors and, on success, allocate a descriptor into *desc_ptr.
+infiniopStatus_t bangCreateRoPEDescriptor(BangHandle_t handle,
+                                          RoPEBangDescriptor_t *desc_ptr,
+                                          infiniopTensorDescriptor_t t,
+                                          infiniopTensorDescriptor_t pos_ids,
+                                          infiniopTensorDescriptor_t sin_table,
+                                          infiniopTensorDescriptor_t cos_table);
+// Write the required workspace size to *size.
+infiniopStatus_t bangGetRoPEWorkspaceSize(RoPEBangDescriptor_t desc, uint64_t *size);
+// Run RoPE; t is the only non-const buffer, so presumably it is updated in place (definition not visible here - confirm).
+infiniopStatus_t bangRoPE(RoPEBangDescriptor_t desc,
+                          void *workspace,
+                          uint64_t workspace_size,
+                          void *t,
+                          void const *pos_ids,
+                          void const *sin_table,
+                          void const *cos_table,
+                          void *stream);
+// Release a descriptor created by bangCreateRoPEDescriptor.
+infiniopStatus_t bangDestroyRoPEDescriptor(RoPEBangDescriptor_t desc);
+
+
+#endif// __BANG_ROTARY_EMBEDDING_H__