Skip to content

Commit a2d8f77

Browse files
committed
blockall_reduce op
1 parent 78b7d86 commit a2d8f77

File tree

16 files changed

+688
-65
lines changed

16 files changed

+688
-65
lines changed

include/ops/reducemax/reducemax.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemaxDescriptor(infiniopHandle_t
1919
bool noop_with_empty_axes
2020
);
2121

22-
__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void const *x, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
22+
__C __export infiniopStatus_t infiniopReducemax(infiniopReducemaxDescriptor_t desc, void *y, void *x, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
2323

2424
__C __export infiniopStatus_t infiniopDestroyReducemaxDescriptor(infiniopReducemaxDescriptor_t desc);
2525
#endif

include/ops/reducemean/reducemean.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReducemeanDescriptor(infiniopHandle_
1919
bool noop_with_empty_axes
2020
);
2121

22-
__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
22+
__C __export infiniopStatus_t infiniopReducemean(infiniopReducemeanDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
2323

2424
__C __export infiniopStatus_t infiniopDestroyReducemeanDescriptor(infiniopReducemeanDescriptor_t desc);
2525
#endif

include/ops/reducemin/reducemin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ __C __export infiniopStatus_t infiniopCreateReduceminDescriptor(infiniopHandle_t
1919
bool noop_with_empty_axes
2020
);
2121

22-
__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void const *src, void const *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
22+
__C __export infiniopStatus_t infiniopReducemin(infiniopReduceminDescriptor_t desc, void *dst, void *src, void *dynamic_axes, uint64_t dynamic_axes_size, void *stream);
2323

2424
__C __export infiniopStatus_t infiniopDestroyReduceminDescriptor(infiniopReduceminDescriptor_t desc);
2525
#endif

operatorspy/tests/reducemax.py

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,6 @@ def test(
113113
c_bool(noop_with_empty_axes),
114114
)
115115
)
116-
print(f"op desctiptor created")
117116
x_tensor.descriptor.contents.invalidate()
118117
y_tensor.descriptor.contents.invalidate()
119118
for i in range(NUM_PRERUN if PROFILE else 1):
@@ -151,32 +150,42 @@ def test(
151150
def test_cpu(lib, test_cases):
152151
device = DeviceEnum.DEVICE_CPU
153152
handle = create_handle(lib, device)
154-
for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases:
155-
print(dynamic_axes)
156-
test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16)
153+
for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
154+
test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
157155
print("\n")
158156
#test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
159157
destroy_handle(lib, handle)
160158

159+
def test_cuda(lib, test_cases):
160+
device = DeviceEnum.DEVICE_CUDA
161+
handle = create_handle(lib, device)
162+
for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
163+
test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
164+
print("\n")
165+
destroy_handle(lib, handle)
161166

162167
if __name__ == "__main__":
163168
test_cases = [
164169
# dynamic calc test eg
165-
((2, 3, 4, 5), [0, 2], False, True, None),
166-
((2, 3, 4, 5), [0, 2], False, True, None),
167-
#(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
168-
((2, 10, 24, 10), [0, 2], False, True, None),
169-
# stride =
170-
((2, 10, 24, 10), [0, 1], False, True, None),
171-
((2, 10, 24, 10), [2, 3], False , True, None),
172-
((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
173-
# validate attribute noop_with_empty_axes and keepdims
174-
((2, 10, 24, 10), None, True, True, None),
175-
((2, 10, 24, 10), None, True, False, None),
176-
((2, 10, 24, 10), None, False, True, None),
177-
((2, 10, 24, 10), None, False, False, None),
178-
((2, 3, 4), [0, 1], False, False, None),
170+
# ((2, 3, 4, 5), [0, 2], False, True, None),
171+
# ((2, 3, 4, 5), [0, 2], False, True, None),
172+
# #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
173+
# ((2, 10, 24, 10), [0, 2], False, True, None),
174+
# # stride =
175+
# ((2, 10, 24, 10), [0, 1], False, True, None),
176+
# ((2, 10, 24, 10), [2, 3], False , True, None),
177+
# ((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
178+
# # validate attribute noop_with_empty_axes and keepdims
179+
# ((2, 10, 24, 10), None, True, True, None),
180+
# ((2, 10, 24, 10), None, True, False, None),
181+
# ((2, 10, 24, 10), None, False, True, None),
182+
# ((2, 10, 24, 10), None, False, False, None),
183+
# ((2, 3, 4), [0, 1], False, False, None),
179184
#((2, 10, 24, 10), [], True),
185+
((4,), [0], False, False, None, torch.float32),
186+
((1000, 3), [0, 1], False, False, None, torch.float16),
187+
((50, 3), [0, 1], False, False, None, torch.float32),
188+
((1000, 3), [0, 1], False, False, None, torch.float32),
180189
]
181190
args = get_args()
182191
lib = open_lib()
@@ -202,5 +211,8 @@ def test_cpu(lib, test_cases):
202211
]
203212
lib.infiniopDestroyReducemaxDescriptor.restype = c_int32
204213
lib.infiniopDestroyReducemaxDescriptor.argtypes = [infiniopReducemaxDescriptor_t]
205-
test_cpu(lib, test_cases)
214+
if args.cpu:
215+
test_cpu(lib, test_cases)
216+
if args.cuda:
217+
test_cuda(lib, test_cases)
206218
print("All tests passed!")

operatorspy/tests/reducemean.py

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def inferShape(x_shape, axis, noop_with_empty_axes, keepdims=False):
4747
return tuple([1] * len(x_shape))
4848
else:
4949
return tuple([])
50+
5051
assert len(axis) <= len(x_shape), "axis out of range"
5152
output_shape = []
5253
axis = [a if a >= 0 else a + len(x_shape) for a in axis] # 更新 axis 列表中的值
@@ -82,9 +83,9 @@ def test(
8283
print(
8384
f"Testing reducemean on {torch_device} with x_shape:{x_shape} dtype:{tensor_dtype}"
8485
)
85-
x = torch.randn(x_shape, dtype=tensor_dtype, device=torch_device)
86+
x = torch.randint(0, 10, x_shape, dtype=tensor_dtype, device=torch_device)
8687
print(f"y_shape = {inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims)}")
87-
y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims), float('-inf'), dtype=tensor_dtype, device=torch_device)
88+
y = torch.full(inferShape(x_shape, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims), float(0), dtype=tensor_dtype, device=torch_device)
8889
print(f"y_shape = {y.shape}")
8990
for i in range(NUM_PRERUN if PROFILE else 1):
9091
ans = reduce_mean(x, axes if dynamic_axes == None else dynamic_axes, noop_with_empty_axes, keepdims)
@@ -141,6 +142,7 @@ def test(
141142
)
142143
elapsed = (time.time() - start_time) / NUM_ITERATIONS
143144
print(f"lib time: {elapsed :10f}")
145+
#print(f"input_data = {x}")
144146
print(f"custom op output:{y}")
145147
print(f"pytorch output:{ans}")
146148
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
@@ -150,30 +152,39 @@ def test(
150152
def test_cpu(lib, test_cases):
151153
device = DeviceEnum.DEVICE_CPU
152154
handle = create_handle(lib, device)
153-
for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes in test_cases:
154-
test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=torch.float16)
155+
for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
156+
test(lib, handle, "cpu", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
155157
print("\n")
156158
#test(lib, handle, "cpu", x_shape, axes, tensor_dtype=torch.float32)
157159
destroy_handle(lib, handle)
158160

161+
def test_cuda(lib, test_cases):
162+
device = DeviceEnum.DEVICE_CUDA
163+
handle = create_handle(lib, device)
164+
for x_shape, axes, noop_with_empty_axes, keepdims, dynamic_axes, tensor_dtype in test_cases:
165+
test(lib, handle, "cuda", x_shape, axes, dynamic_axes, noop_with_empty_axes, keepdims, tensor_dtype=tensor_dtype)
166+
print("\n")
167+
destroy_handle(lib, handle)
168+
159169

160170
if __name__ == "__main__":
161171
test_cases = [
162172
# dynamic calc test eg
163-
((2, 3, 4, 5), [0, 2], False, True, None),
164-
((2, 3, 4, 5), [0, 2], False, True, None),
165-
#(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
166-
((2, 10, 24, 10), [0, 2], False, True, None),
167-
# stride =
168-
((2, 10, 24, 10), [0, 1], False, True, None),
169-
((2, 10, 24, 10), [2, 3], False , True, None),
170-
((2, 10, 24, 10), [0, 1, 2, 3], False, True, None),
173+
# ((2, 3, 4, 5), [0, 2], False, True, None),
174+
# ((2, 3, 4, 5), [0, 2], False, True, None),
175+
# #(input_shape, axis, noop_with_empty_axes, keepdims, dynamic_axes)
176+
# ((2, 10, 24, 10), [0, 2], False, True, None),
177+
# # stride =
178+
# ((2, 10, 24, 10), [0, 1], False, True, None),
179+
# ((2, 10, 24, 10), [2, 3], False , True, None),
180+
((50, 3), [0, 1], False, False, None, torch.float16),
181+
((1000, 3), [0, 1], False, False, None, torch.float16),
171182
# validate attribute noop_with_empty_axes and keepdims
172-
((2, 10, 24, 10), None, True, True, None),
173-
((2, 10, 24, 10), None, True, False, None),
174-
((2, 10, 24, 10), None, False, True, None),
175-
((2, 10, 24, 10), None, False, False, None),
176-
((2, 3, 4), [0, 1], False, False, None),
183+
# ((2, 10, 24, 10), None, True, True, None),
184+
# ((2, 10, 24, 10), None, True, False, None),
185+
# ((2, 10, 24, 10), None, False, True, None),
186+
# ((2, 10, 24, 10), None, False, False, None),
187+
# ((2, 3, 4), [0, 1], False, False, None),
177188
#((2, 10, 24, 10), [], True),
178189
]
179190
args = get_args()
@@ -200,5 +211,8 @@ def test_cpu(lib, test_cases):
200211
]
201212
lib.infiniopDestroyReducemeanDescriptor.restype = c_int32
202213
lib.infiniopDestroyReducemeanDescriptor.argtypes = [infiniopReducemeanDescriptor_t]
203-
test_cpu(lib, test_cases)
214+
if args.cpu:
215+
test_cpu(lib, test_cases)
216+
if args.cuda:
217+
test_cuda(lib, test_cases)
204218
print("All tests passed!")

src/ops/reduce/cpu/reduce_cpu.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,8 @@ infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc,
214214

215215
infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc,
216216
void *y,
217-
void const *x,
218-
void const *dynamic_axes,
217+
void *x,
218+
void *dynamic_axes,
219219
uint64_t dynamic_axes_size,
220220
void *stream){
221221
if (desc->is_axes_static == true && dynamic_axes_size > 0){

src/ops/reduce/cpu/reduce_cpu.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle,
4444

4545
infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc,
4646
void *y,
47-
void const *x,
48-
void const *dynamic_axes,
47+
void *x,
48+
void *dynamic_axes,
4949
uint64_t dynamic_axes_size,
5050
void *stream);
5151

src/ops/reduce/cuda/reduce_cuda.cc

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#include "reduce_cuda.h"
2+
#include "../../../devices/cuda/common_cuda.h"
3+
#include "../../utils.h"
4+
// need reduce_size, output_size, output_stride, input_stride
5+
// generate reduce_mask
6+
infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle,
7+
ReduceCudaDescriptor_t *desc_ptr,
8+
infiniopTensorDescriptor_t y,
9+
infiniopTensorDescriptor_t x,
10+
int64_t const *axes,
11+
uint64_t axes_size,
12+
int reduce_op_type,
13+
bool keepdims
14+
) {
15+
if (x->dt != F16 && x->dt != F32) {
16+
return STATUS_BAD_TENSOR_DTYPE;
17+
}
18+
if (keepdims) {
19+
if (x->ndim != y->ndim) {
20+
return STATUS_BAD_TENSOR_SHAPE;
21+
}
22+
}
23+
if (x->dt != y->dt) {
24+
return STATUS_BAD_TENSOR_DTYPE;
25+
}
26+
if (!is_contiguous(x) || !is_contiguous(y)) {
27+
return STATUS_BAD_TENSOR_STRIDES;
28+
}
29+
uint64_t element_num = 1;
30+
uint64_t output_size = 1;
31+
for (uint64_t i = 0; i < x->ndim; i++) {
32+
element_num *= x->shape[i];
33+
}
34+
35+
for (uint64_t i = 0; i < y->ndim; i++) {
36+
output_size *= y->shape[i];
37+
}
38+
uint64_t reduce_size = element_num / output_size;
39+
uint64_t ndim = y->ndim;
40+
41+
bool *reduce_mask = new bool[x->ndim];
42+
int64_t *input_strides = new int64_t[x->ndim];
43+
int64_t *output_strides = new int64_t[y->ndim];
44+
uint64_t *input_shape = new uint64_t[x->ndim];
45+
uint64_t *output_shape = new uint64_t[y->ndim];
46+
47+
memcpy(input_shape, x->shape, x->ndim * sizeof(uint64_t));
48+
memcpy(output_shape, y->shape, y->ndim * sizeof(uint64_t));
49+
memcpy(input_strides, x->strides, x->ndim * sizeof(int64_t));
50+
memcpy(output_strides, y->strides, y->ndim * sizeof(int64_t));
51+
52+
bool if_reduce_axes_contiguous = true;
53+
int reduce_mode = 0;
54+
for (uint64_t i = 0; i < axes_size; i++) {
55+
reduce_mask[axes[i]] = true;
56+
if (i < axes_size - 1 && axes[i] != axes[i + 1] - 1) {
57+
if_reduce_axes_contiguous = false;
58+
}
59+
}
60+
if (if_reduce_axes_contiguous) {
61+
if (axes_size == x->ndim) {
62+
// all axes are reduced
63+
int reduce_mode = 0;
64+
} else {
65+
// some axes are not reduced but axes are contiguous
66+
if (reduce_size > 1024 && output_size < 128) reduce_mode = 1; // multi-thread for each output element
67+
else reduce_mode = 2; // one thread for each output element
68+
}
69+
} else {
70+
if (reduce_size > 1024 && output_size < 128) reduce_mode = 3;
71+
else reduce_mode = 4;
72+
}
73+
bool *d_reduce_mask = new bool[x->ndim];
74+
int64_t *d_input_strides = new int64_t[x->ndim];
75+
int64_t *d_output_strides = new int64_t[y->ndim];
76+
uint64_t *d_input_shape = new uint64_t[x->ndim];
77+
uint64_t *d_output_shape = new uint64_t[y->ndim];
78+
79+
checkCudaErrorWithCode(cudaMalloc((void**)&d_reduce_mask, x->ndim * sizeof(bool)), STATUS_MEMORY_NOT_ALLOCATED);
80+
checkCudaErrorWithCode(cudaMalloc((void**)&d_input_strides, x->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED);
81+
checkCudaErrorWithCode(cudaMalloc((void**)&d_output_strides, y->ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED);
82+
checkCudaErrorWithCode(cudaMalloc((void**)&d_input_shape, x->ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED);
83+
checkCudaErrorWithCode(cudaMalloc((void**)&d_output_shape, y->ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED);
84+
85+
checkCudaErrorWithCode(cudaMemcpy(d_reduce_mask, reduce_mask, x->ndim * sizeof(bool), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
86+
checkCudaErrorWithCode(cudaMemcpy(d_input_strides, input_strides, x->ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
87+
checkCudaErrorWithCode(cudaMemcpy(d_output_strides, output_strides, y->ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
88+
checkCudaErrorWithCode(cudaMemcpy(d_input_shape, input_shape, x->ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
89+
checkCudaErrorWithCode(cudaMemcpy(d_output_shape, output_shape, y->ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
90+
91+
*desc_ptr = new ReduceCudaDescriptor{
92+
DevNvGpu,
93+
x->dt,
94+
ndim,
95+
d_reduce_mask,
96+
d_input_strides,
97+
d_output_strides,
98+
d_input_shape,
99+
d_output_shape,
100+
reduce_size,
101+
element_num,
102+
output_size,
103+
static_cast<int>(reduce_op_type),
104+
reduce_mode
105+
};
106+
delete [] reduce_mask;
107+
delete [] input_strides;
108+
delete [] output_strides;
109+
delete [] input_shape;
110+
delete [] output_shape;
111+
return STATUS_SUCCESS;
112+
}
113+
114+
infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc) {
115+
checkCudaErrorWithCode(cudaFree((void*)desc->reduce_mask), STATUS_EXECUTION_FAILED);
116+
checkCudaErrorWithCode(cudaFree((void*)desc->input_strides), STATUS_EXECUTION_FAILED);
117+
checkCudaErrorWithCode(cudaFree((void*)desc->output_strides), STATUS_EXECUTION_FAILED);
118+
checkCudaErrorWithCode(cudaFree((void*)desc->input_shape), STATUS_EXECUTION_FAILED);
119+
checkCudaErrorWithCode(cudaFree((void*)desc->output_shape), STATUS_EXECUTION_FAILED);
120+
delete desc;
121+
return STATUS_SUCCESS;
122+
}

0 commit comments

Comments
 (0)