
Commit 78b7d86

add where_cuda op

1 parent 5818dae

File tree

14 files changed: +520 -94 lines changed

env.sh

Lines changed: 0 additions & 6 deletions
This file was deleted.

operatorspy/tests/clip.py

Lines changed: 11 additions & 14 deletions

@@ -21,7 +21,7 @@
 from typing import Tuple
 import numpy as np

-PROFILE = False
+PROFILE = True
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000

@@ -112,28 +112,21 @@ def test(
     )
     elapsed = (time.time() - start_time) / NUM_ITERATIONS
     print(f"lib time: {elapsed :10f}")
-    print("x:", x)
-    print("custom op ans:", output)
-    print("ans:", ans) if max != None or min != None else print("ans:", x)
     assert torch.allclose(output, ans, atol=0, rtol=0) if max != None or min != None else torch.allclose(output, x, atol=0, rtol=0)
     check_error(lib.infiniopDestroyClipDescriptor(descriptor))

 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for x_shape, min, max in test_cases:
-        test(lib, handle, "cpu", x_shape, min, max, tensor_dtype=torch.float16)
-        print("\n")
-        #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
+    for x_shape, min, max, tensor_type in test_cases:
+        test(lib, handle, "cpu", x_shape, min, max, tensor_dtype=tensor_type)
     destroy_handle(lib, handle)

 def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for x_shape, min, max, tensor_type in test_cases:
         test(lib, handle, "cuda", x_shape, min, max, tensor_dtype=tensor_type)
-        print("\n")
-        #test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
     destroy_handle(lib, handle)

@@ -145,15 +138,16 @@ def test_cuda(lib, test_cases):
         ((3, 4), None, None, torch.float32),
         ((16), -1, 1, torch.float32),
         ((1024, 1024), -1, 1, torch.float32),
-
+        ((4096, 4096), -1, 1, torch.float32),
+
+        ((13), -1, 1, torch.float32),
         ((3, 4), -1, 1, torch.float16),
         ((3, 4), None, 1, torch.float16),
         ((3, 4), -1, None, torch.float16),
         ((3, 4), None, None, torch.float16),
         ((16), -1, 1, torch.float16),
         ((1024, 1024), -1, 1, torch.float16),
-
-        # stride =
+        ((4096, 4096), -1, 1, torch.float16),
     ]
     args = get_args()
     lib = open_lib()

@@ -175,5 +169,8 @@ def test_cuda(lib, test_cases):
     ]
     lib.infiniopDestroyClipDescriptor.restype = c_int32
     lib.infiniopDestroyClipDescriptor.argtypes = [infiniopClipDescriptor_t]
-    test_cuda(lib, test_cases)
+    if args.cuda:
+        test_cuda(lib, test_cases)
+    if args.cpu:
+        test_cpu(lib, test_cases)
     print("All tests passed!")

operatorspy/tests/where.py

Lines changed: 44 additions & 18 deletions

@@ -20,9 +20,8 @@
 import torch
 from typing import Tuple
 import numpy as np
-import onnx

-PROFILE = True
+PROFILE = False
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000

@@ -43,15 +42,22 @@ def tuple_to_void_p(py_tuple: Tuple):
 def inferShape(x_shape, y_shape):
     ndim_x = len(x_shape)
     ndim_y = len(y_shape)
-    ndim = 0
-    output_shape = []
     ndim = max(ndim_x, ndim_y)
-    for i in range(ndim - 1, -1, -1):
-        dim_x = x_shape[i] if i < ndim_x else 1
-        dim_y = y_shape[i] if i < ndim_y else 1
-        output_shape.append(max(dim_x, dim_y))
-    output_shape.reverse()
+    output_shape = []
+
+    for i in range(-1, -ndim-1, -1):
+        dim_x = x_shape[i] if i >= -ndim_x else 1
+        dim_y = y_shape[i] if i >= -ndim_y else 1
+
+        if dim_x != dim_y:
+            if dim_x != 1 and dim_y != 1:
+                raise ValueError(f"Shapes {x_shape} and {y_shape} cannot be broadcast together")
+
+        output_dim = max(dim_x, dim_y)
+        output_shape.insert(0, output_dim)
+
     return tuple(output_shape)

@@ -68,8 +74,7 @@ def test(
     condition = torch.randint(0, 2, condition_shape, dtype=torch.uint8).to(torch_device)
     src1 = torch.randn(src1_shape, dtype=tensor_dtype, device=torch_device)
     src2 = torch.randn(src2_shape, dtype=tensor_dtype, device=torch_device)
-    output = torch.randn(inferShape(src1_shape, src2_shape), dtype=tensor_dtype, device=torch_device)
-
+    output = torch.randn(inferShape(inferShape(src1_shape, src2_shape), condition_shape), dtype=tensor_dtype, device=torch_device)

     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = where(condition, src1, src2)

@@ -130,18 +135,33 @@ def test(
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for condition_shape, src1_shape, src2_shape in test_cases:
-        test(lib, handle, "cpu", condition_shape, src1_shape, src2_shape, tensor_dtype=torch.float16)
+    for condition_shape, src1_shape, src2_shape, tensor_dtype in test_cases:
+        test(lib, handle, "cpu", condition_shape, src1_shape, src2_shape, tensor_dtype=tensor_dtype)
+        print("\n")
+    destroy_handle(lib, handle)
+
+def test_cuda(lib, test_cases):
+    device = DeviceEnum.DEVICE_CUDA
+    handle = create_handle(lib, device)
+    for condition_shape, src1_shape, src2_shape, tensor_dtype in test_cases:
+        test(lib, handle, "cuda", condition_shape, src1_shape, src2_shape, tensor_dtype=tensor_dtype)
         print("\n")
     destroy_handle(lib, handle)


 if __name__ == "__main__":
     test_cases = [
-        ((2, 3, 4, 5), (2, 3, 4, 5), (2, 3, 4, 5)),
-        ((3, 1), (3, 4), (1, 4)),
-        ((1,), (3, 4), (3, 4)),
-        ((2, 1, 3), (1, 4, 3), (2, 4, 1)),
+        ((2, 16), (2, 16), (2, 16), torch.float32),
+        ((2, 3, 1, 1), (1, 4, 5), (2, 3, 4, 5), torch.float32),
+        ((3, 1), (3, 4), (1, 4), torch.float32),
+        ((1,), (3, 4), (3, 4), torch.float32),
+        ((2, 1, 3), (1, 4, 3), (2, 4, 1), torch.float32),
+
+        ((2, 16), (2, 16), (2, 16), torch.float16),
+        ((2, 3, 1, 1), (1, 4, 5), (2, 3, 4, 5), torch.float16),
+        ((3, 1), (3, 4), (1, 4), torch.float16),
+        ((1,), (3, 4), (3, 4), torch.float16),
+        ((2, 1, 3), (1, 4, 3), (2, 4, 1), torch.float16),
     ]
     args = get_args()
     lib = open_lib()

@@ -150,6 +170,9 @@ def test_cpu(lib, test_cases):
        infiniopHandle_t,
        POINTER(infiniopWhereDescriptor_t),
        infiniopTensorDescriptor_t,
+       infiniopTensorDescriptor_t,
+       infiniopTensorDescriptor_t,
+       infiniopTensorDescriptor_t
    ]
    lib.infiniopWhere.restype = c_int32
    lib.infiniopWhere.argtypes = [

@@ -162,5 +185,8 @@ def test_cpu(lib, test_cases):
    ]
    lib.infiniopDestroyWhereDescriptor.restype = c_int32
    lib.infiniopDestroyWhereDescriptor.argtypes = [infiniopWhereDescriptor_t]
-   test_cpu(lib, test_cases)
+   if args.cpu:
+       test_cpu(lib, test_cases)
+   if args.cuda:
+       test_cuda(lib, test_cases)
    print("All tests passed!")

src/ops/clip/cuda/clip_cuda.cc

Lines changed: 0 additions & 4 deletions

@@ -24,15 +24,11 @@ infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle,
         element_num *= x->shape[i];
     }
     uint64_t ndim = y->ndim;
-    uint64_t S = ndim == 2 ? y->shape[0] : 1;
-    uint64_t K = ndim == 2 ? y->shape[1] : 1;
     *desc_ptr = new ClipCudaDescriptor{
         DevNvGpu,
         x->dt,
         ndim,
         element_num,
-        S,
-        K
     };
     return STATUS_SUCCESS;
 }

src/ops/clip/cuda/clip_cuda.cu

Lines changed: 17 additions & 3 deletions

@@ -37,11 +37,17 @@ __global__ void clip_f32x4_kernel(float *a, float *b, float max_value, float min

 __global__ void clip_f16x8_pack_kernel(half *a, half *b, float max_value, float min_value, int N){
     int idx = 8 * (blockDim.x * blockIdx.x + threadIdx.x);
+    if (idx >= N) return;
     const half min_half = __float2half(min_value);
     const half max_half = __float2half(max_value);
-    if (idx >= N) return;
     half pack_a[8], pack_b[8];
-    LDST128BITS(pack_a[0]) = LDST128BITS(a[idx]);
+    if (idx + 7 < N) {
+        LDST128BITS(pack_a[0]) = LDST128BITS(a[idx]);
+    } else {
+        for (int i = 0; i < 8 && (idx + i) < N; i++) {
+            pack_a[i] = a[idx + i];
+        }
+    }
 #pragma unroll
     for (int i = 0; i < 8; i++)
     {

@@ -66,6 +72,14 @@ infiniopStatus_t clip_nv_gpu(
     int per_thread_element,
     void* stream) {
     uint64_t N = desc->element_num;
+    dim3 block(256 / per_thread_element);
+    dim3 grid((N + 256 - 1) / 256);
+    if constexpr (std::is_same<Tdata, float>::value) {
+        clip_f32x4_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<float *>(x), reinterpret_cast<float *>(y), max_value, min_value, N);
+    } else {
+        clip_f16x8_pack_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(reinterpret_cast<half *>(x), reinterpret_cast<half *>(y), max_value, min_value, N);
+    }
+    /*
     if (desc->ndim != 2){
         dim3 block(256 / per_thread_element);
         dim3 grid((N + 256 - 1) / 256);

@@ -94,6 +108,7 @@ infiniopStatus_t clip_nv_gpu(
         }
     }
 }
+    */
     return STATUS_SUCCESS;
 }

@@ -105,7 +120,6 @@ infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc,
     void *stream){
     bool has_min = true;
     bool has_max = true;
-    uint64_t N = desc->element_num;
     if (min == nullptr){
         has_min = false;
     }
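The new launch configuration makes every block cover exactly 256 elements: per_thread_element is presumably 4 for the float4 kernel and 8 for the half8 pack kernel, giving 64- and 32-thread blocks respectively. A plain-Python sanity check of that arithmetic (a sketch, not part of the commit):

    def launch_shape(n_elements, per_thread_element):
        threads_per_block = 256 // per_thread_element
        n_blocks = (n_elements + 256 - 1) // 256   # ceil-divide; each block clips 256 values
        return n_blocks, threads_per_block

    print(launch_shape(13, 8))            # (1, 32): the new ((13),) test case exercises the tail loop
    print(launch_shape(4096 * 4096, 4))   # (65536, 64) for the float32 path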

src/ops/clip/cuda/clip_cuda.h

Lines changed: 0 additions & 2 deletions

@@ -10,8 +10,6 @@ typedef struct ClipCudaDescriptor {
     DT dtype;
     uint64_t ndim;
     uint64_t element_num;
-    uint64_t S;
-    uint64_t K;
 } ClipCudaDescriptor;

 typedef struct ClipCudaDescriptor *ClipCudaDescriptor_t;

src/ops/clip/operator.cc

Lines changed: 0 additions & 1 deletion

@@ -28,7 +28,6 @@ __C infiniopStatus_t infiniopCreateClipDescriptor(
     }
 #endif
     }
-    std::cout << "Creating Clip Descriptorxx" << std::endl;
     return STATUS_BAD_DEVICE;
 }

src/ops/utils.h

Lines changed: 28 additions & 0 deletions

@@ -104,6 +104,34 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
     return true;
 }

+inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
+                              const uint64_t *shape2, uint64_t ndim2,
+                              const uint64_t *shape3, uint64_t ndim3,
+                              uint64_t *broadcast_shape, uint64_t *padded_shape1,
+                              uint64_t *padded_shape2, uint64_t *padded_shape3,
+                              uint64_t max_rank) {
+    // prepending and initializing
+    std::fill(padded_shape1, padded_shape1 + max_rank, 1);
+    std::fill(padded_shape2, padded_shape2 + max_rank, 1);
+    std::fill(padded_shape3, padded_shape3 + max_rank, 1);
+    std::copy(shape1, shape1 + ndim1, padded_shape1 + max_rank - ndim1);
+    std::copy(shape2, shape2 + ndim2, padded_shape2 + max_rank - ndim2);
+    std::copy(shape3, shape3 + ndim3, padded_shape3 + max_rank - ndim3);
+
+    // compute broadcasted shape
+    for (size_t i = 0; i < max_rank; ++i) {
+        if ((padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) &&
+            (padded_shape1[i] == padded_shape3[i] || padded_shape1[i] == 1 || padded_shape3[i] == 1)) {
+            broadcast_shape[i] = std::max(std::max(padded_shape1[i], padded_shape2[i]), padded_shape3[i]);
+        } else {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 // check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes
 inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c,
                                   uint64_t broadcast_ndim) {
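The new three-tensor overload applies the same right-aligned broadcasting rule as the two-tensor version above, padding each shape with leading 1s out to max_rank. For a quick cross-check of the expected results, NumPy's np.broadcast_shapes (available since NumPy 1.20) computes the same shapes for the valid where.py test cases:

    import numpy as np

    # matches the test cases in operatorspy/tests/where.py
    assert np.broadcast_shapes((2, 3, 1, 1), (1, 4, 5), (2, 3, 4, 5)) == (2, 3, 4, 5)
    assert np.broadcast_shapes((2, 1, 3), (1, 4, 3), (2, 4, 1)) == (2, 4, 3)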
