diff --git a/include/fused_kernel/algorithms/basic_ops/algebraic.h b/include/fused_kernel/algorithms/basic_ops/algebraic.h index 3b192751..2fce9834 100644 --- a/include/fused_kernel/algorithms/basic_ops/algebraic.h +++ b/include/fused_kernel/algorithms/basic_ops/algebraic.h @@ -38,7 +38,7 @@ namespace fk { using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { const float3 xOut = input * params.x; const float3 yOut = input * params.y; const float3 zOut = input * params.z; @@ -54,7 +54,7 @@ namespace fk { FK_STATIC_STRUCT(MxVFloat3, SelfType) using Parent = UnaryOperation, float3, MxVFloat3>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { const float3 xOut = get<0>(input) * get<1>(input).x; const float3 yOut = get<0>(input) * get<1>(input).y; const float3 zOut = get<0>(input) * get<1>(input).z; diff --git a/include/fused_kernel/algorithms/basic_ops/arithmetic.h b/include/fused_kernel/algorithms/basic_ops/arithmetic.h index 02de435b..a07a0d04 100644 --- a/include/fused_kernel/algorithms/basic_ops/arithmetic.h +++ b/include/fused_kernel/algorithms/basic_ops/arithmetic.h @@ -30,7 +30,7 @@ namespace fk { FK_STATIC_STRUCT(Add, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return input + params; } }; @@ -43,7 +43,7 @@ namespace fk { FK_STATIC_STRUCT(Add, SelfType) using Parent = UnaryOperation, O, Add>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return get<0>(input) + get<1>(input); } }; @@ -56,7 +56,7 @@ namespace fk { FK_STATIC_STRUCT(Sub, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return input - params; } }; @@ -69,7 +69,7 @@ namespace fk { FK_STATIC_STRUCT(Mul, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return input * params; } }; @@ -82,7 +82,7 @@ namespace fk { FK_STATIC_STRUCT(Div, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return input / params; } }; diff --git a/include/fused_kernel/algorithms/basic_ops/cast.h b/include/fused_kernel/algorithms/basic_ops/cast.h index aaf76099..1627104d 100644 --- a/include/fused_kernel/algorithms/basic_ops/cast.h +++ b/include/fused_kernel/algorithms/basic_ops/cast.h @@ -27,7 +27,7 @@ namespace fk { FK_STATIC_STRUCT(Cast, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::cast::f(input); } }; diff --git a/include/fused_kernel/algorithms/basic_ops/logical.h b/include/fused_kernel/algorithms/basic_ops/logical.h index 8376704e..a6e07d83 100644 --- a/include/fused_kernel/algorithms/basic_ops/logical.h +++ b/include/fused_kernel/algorithms/basic_ops/logical.h @@ -29,7 +29,7 @@ namespace fk { FK_STATIC_STRUCT(IsEven, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE auto exec(const InputType& input) { + FK_HOST_DEVICE_FUSE auto exec(const InputType input) { return cxp::is_even::f(input); } }; @@ -42,7 +42,7 @@ namespace fk { FK_STATIC_STRUCT(Max, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return cxp::max::f(input, params); } }; @@ -55,7 +55,7 @@ namespace fk { FK_STATIC_STRUCT(Max, SelfType) using Parent = UnaryOperation, O, Max>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::max::f(get<0>(input), get<1>(input)); } }; @@ -68,7 +68,7 @@ namespace fk { FK_STATIC_STRUCT(Min, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return cxp::min::f(input, params); } }; @@ -81,7 +81,7 @@ namespace fk { FK_STATIC_STRUCT(Min, SelfType) using Parent = UnaryOperation, O, Min>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::min::f(get<0>(input), get<1>(input)); } }; @@ -95,21 +95,21 @@ namespace fk { using Parent = UnaryOperation, bool, Equal>; DECLARE_UNARY_PARENT template > - FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType& input) { + FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType input) { return get<0>(input) == get<1>(input); } template > - FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType& input) { + FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType input) { const auto result = get<0>(input) == get<1>(input); return result.x && result.y; } template > - FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType& input) { + FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType input) { const auto result = get<0>(input) == get<1>(input); return result.x && result.y && result.z; } template > - FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType& input) { + FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType input) { const auto result = get<0>(input) == get<1>(input); return result.x && result.y && result.z && result.w; } diff --git a/include/fused_kernel/algorithms/basic_ops/memory_operations.h b/include/fused_kernel/algorithms/basic_ops/memory_operations.h index 844e52ca..91184730 100644 --- a/include/fused_kernel/algorithms/basic_ops/memory_operations.h +++ b/include/fused_kernel/algorithms/basic_ops/memory_operations.h @@ -32,16 +32,16 @@ namespace fk { FK_STATIC_STRUCT(PerThreadRead, SelfType) DECLARE_READ_PARENT template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const ParamsType& params) + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const ParamsType& params) -> ThreadFusionType { return *PtrAccessor::template cr_point>(thread, params); } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { if constexpr (D == ND::_1D) { return 1; } else { @@ -49,7 +49,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { if constexpr (D == ND::_1D || D == ND::_2D) { return 1; } else { @@ -57,7 +57,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } @@ -90,15 +90,15 @@ namespace fk { FK_STATIC_STRUCT(PerThreadWrite, SelfType) DECLARE_WRITE_PARENT template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, - const ThreadFusionType& input, + FK_HOST_DEVICE_FUSE void exec(const Point thread, + const ThreadFusionType input, const ParamsType& params) { *PtrAccessor::template point>(thread, params) = input; } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } FK_HOST_FUSE InstantiableType build(const Ptr& ptr) { @@ -131,23 +131,23 @@ namespace fk { DECLARE_READ_PARENT template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const ParamsType& params) + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const ParamsType& params) -> ThreadFusionType { return *PtrAccessor::template cr_point>(thread, params); } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dims.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return opData.params.dims.planes; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } @@ -165,15 +165,15 @@ namespace fk { FK_STATIC_STRUCT(TensorWrite, SelfType) DECLARE_WRITE_PARENT template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const ThreadFusionType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const ThreadFusionType input, const ParamsType& params) { *PtrAccessor::template point>(thread, params) = input; } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } }; @@ -186,7 +186,7 @@ namespace fk { public: FK_STATIC_STRUCT(TensorSplit, SelfType) DECLARE_WRITE_PARENT - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const T& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split tensor write. It must be one of 2, 3 or 4."); @@ -203,10 +203,10 @@ namespace fk { *(work_plane + (planePixels * 3)) = input.w; } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } }; @@ -219,7 +219,7 @@ namespace fk { public: FK_STATIC_STRUCT(TensorTSplit, SelfType) DECLARE_WRITE_PARENT - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split tensor write. It must be one of 2, 3 or 4."); @@ -232,10 +232,10 @@ namespace fk { *PtrAccessor::point(thread, params, 3) = input.w; } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } }; @@ -248,7 +248,7 @@ namespace fk { public: FK_STATIC_STRUCT(TensorPack, SelfType) DECLARE_READ_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split tensor read. It must be one of 2, 3 or 4."); @@ -268,19 +268,19 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dims.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return opData.params.dims.planes; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } @@ -297,7 +297,7 @@ namespace fk { public: FK_STATIC_STRUCT(TensorTPack, SelfType) DECLARE_READ_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split tensor read. It must be one of 2, 3 or 4."); @@ -316,16 +316,16 @@ namespace fk { return make_(x, y, z, w); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dims.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return opData.params.dims.planes; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -365,7 +365,7 @@ namespace fk { public: FK_STATIC_STRUCT(SplitWrite, SelfType) DECLARE_WRITE_PARENT - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split write. It must be one of 2, 3 or 4."); *PtrAccessor::point(thread, params.x) = input.x; @@ -373,10 +373,10 @@ namespace fk { if constexpr (cn >= 3) *PtrAccessor::point(thread, params.z) = input.z; if constexpr (cn == 4) *PtrAccessor::point(thread, params.w) = input.w; } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.x.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.x.dims.pitch; } @@ -420,7 +420,7 @@ namespace fk { namespace circular_batch_internal { template - FK_HOST_DEVICE_CNST Point computeCircularThreadIdx(const Point& currentIdx, const int& fst) { + FK_HOST_DEVICE_CNST Point computeCircularThreadIdx(const Point currentIdx, const int fst) { if constexpr (direction == CircularDirection::Ascendent) { const int z = currentIdx.z + fst; return { currentIdx.x, currentIdx.y, z >= BATCH ? z - BATCH : z }; @@ -444,7 +444,7 @@ namespace fk { FK_STATIC_STRUCT(CircularBatchRead, SelfType) DECLARE_READ_PARENT template - FK_HOST_DEVICE_FUSE ThreadFusionType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE ThreadFusionType exec(const Point thread, const ParamsType& params) { const Point newThreadIdx = circular_batch_internal::computeCircularThreadIdx(thread, params.first); if constexpr (THREAD_FUSION) { return Operation::template exec(newThreadIdx, params.opData[newThreadIdx.z]); @@ -452,19 +452,19 @@ namespace fk { return Operation::exec(newThreadIdx, params.opData[newThreadIdx.z]); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } @@ -486,7 +486,7 @@ namespace fk { FK_STATIC_STRUCT(CircularBatchWrite, SelfType) DECLARE_WRITE_PARENT template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const ThreadFusionType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const ThreadFusionType input, const ParamsType& params) { const Point newThreadIdx = circular_batch_internal::computeCircularThreadIdx(thread, params.first); if constexpr (THREAD_FUSION) { Operation::template exec(newThreadIdx, input, params.opData[newThreadIdx.z]); @@ -494,10 +494,10 @@ namespace fk { Operation::exec(newThreadIdx, input, params.opData[newThreadIdx.z]); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opBatch) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opBatch) { return Operation::num_elems_x(thread, opBatch.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opBatch) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opBatch) { return Operation::pitch(thread, opBatch.params.opData[thread.z]); } }; @@ -515,7 +515,7 @@ namespace fk { FK_STATIC_STRUCT(CircularTensorRead, SelfType) DECLARE_READ_PARENT template - FK_HOST_DEVICE_FUSE ThreadFusionType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE ThreadFusionType exec(const Point thread, const ParamsType& params) { const Point newThreadIdx = circular_batch_internal::computeCircularThreadIdx(thread, params.first); if constexpr (THREAD_FUSION) { return Operation::template exec(newThreadIdx, params.opData); @@ -523,19 +523,19 @@ namespace fk { return Operation::exec(newThreadIdx, params.opData); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData); } @@ -557,8 +557,8 @@ namespace fk { FK_STATIC_STRUCT(CircularTensorWrite, SelfType) DECLARE_WRITE_PARENT template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, - const ThreadFusionType& input, + FK_HOST_DEVICE_FUSE void exec(const Point thread, + const ThreadFusionType input, const ParamsType& params) { const Point newThreadIdx = circular_batch_internal::computeCircularThreadIdx(thread, params.first); if constexpr (THREAD_FUSION) { @@ -567,10 +567,10 @@ namespace fk { Operation::exec(newThreadIdx, input, params.opData); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData); } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData); } }; diff --git a/include/fused_kernel/algorithms/basic_ops/set.h b/include/fused_kernel/algorithms/basic_ops/set.h index 40c86131..5a0c9a5b 100644 --- a/include/fused_kernel/algorithms/basic_ops/set.h +++ b/include/fused_kernel/algorithms/basic_ops/set.h @@ -34,19 +34,19 @@ namespace fk { FK_STATIC_STRUCT(ReadSet, SelfType) using Parent = ReadOperation, T, TF::DISABLED, ReadSet>; DECLARE_READ_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { return params.value; } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.size.x; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.size.y; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return opData.params.size.z; } diff --git a/include/fused_kernel/algorithms/basic_ops/static_loop.h b/include/fused_kernel/algorithms/basic_ops/static_loop.h index 321ead53..88c7f783 100644 --- a/include/fused_kernel/algorithms/basic_ops/static_loop.h +++ b/include/fused_kernel/algorithms/basic_ops/static_loop.h @@ -29,7 +29,7 @@ namespace fk { private: template - FK_DEVICE_FUSE OutputType helper_exec(const InputType& input, const ParamsType& params) { + FK_DEVICE_FUSE OutputType helper_exec(const InputType input, const ParamsType& params) { if constexpr (ITERATION + 1 < ITERATIONS) { return helper_exec(Operation::exec(input, params), params); } else { @@ -38,7 +38,7 @@ namespace fk { } public: - FK_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return helper_exec<0>(Operation::exec(input, params), params); } }; diff --git a/include/fused_kernel/algorithms/basic_ops/vector_ops.h b/include/fused_kernel/algorithms/basic_ops/vector_ops.h index b3ccd25e..f4c97584 100644 --- a/include/fused_kernel/algorithms/basic_ops/vector_ops.h +++ b/include/fused_kernel/algorithms/basic_ops/vector_ops.h @@ -27,7 +27,7 @@ namespace fk { FK_STATIC_STRUCT(Discard, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(cn > cn, "Output type should at least have one channel less"); static_assert(std::is_same_v, VBase>, "Base types should be the same"); @@ -48,7 +48,7 @@ namespace fk { FK_STATIC_STRUCT(VectorReorder, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(validCUDAVec, "Non valid CUDA vetor type: UnaryVectorReorder"); static_assert(cn >= 2, "Minimum number of channels is 2: UnaryVectorReorder"); return {static_get(input)...}; @@ -63,7 +63,7 @@ namespace fk { FK_STATIC_STRUCT(VectorReorderRT, SelfType) using Parent = BinaryOperation>, T, VectorReorderRT>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { static_assert(validCUDAVec, "Non valid CUDA vetor type"); static_assert(cn >= 2, "Minimum number of channels is 2"); if constexpr (cn == 2) { @@ -86,7 +86,7 @@ namespace fk { FK_STATIC_STRUCT(VectorReduce, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::vector_reduce::f(input); } }; @@ -99,7 +99,7 @@ namespace fk { FK_STATIC_STRUCT(AddLast, SelfType) using Parent = BinaryOperation::base, O, AddLast>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { static_assert(cn == cn -1, "Output type should have one channel more"); static_assert(std::is_same_v::base, typename VectorTraits::base>, "Base types should be the same"); @@ -125,7 +125,7 @@ namespace fk { FK_STATIC_STRUCT(VectorAnd, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::vector_and::f(input); } }; diff --git a/include/fused_kernel/algorithms/image_processing/border_reader.h b/include/fused_kernel/algorithms/image_processing/border_reader.h index e8797f6f..a681136f 100644 --- a/include/fused_kernel/algorithms/image_processing/border_reader.h +++ b/include/fused_kernel/algorithms/image_processing/border_reader.h @@ -56,15 +56,15 @@ namespace fk { using Parent = IncompleteReadBackOperation; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -93,15 +93,15 @@ namespace fk { using Parent = IncompleteReadBackOperation, NullType, NullType, SelfType>; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -127,15 +127,15 @@ namespace fk { using Parent = IncompleteReadBackOperation, NullType, NullType, SelfType>; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -165,18 +165,18 @@ public: \ using Parent = ReadBackOperation, \ BackIOp_, typename BackIOp_::Operation::OutputType, SelfType>; \ DECLARE_READBACK_PARENT \ -FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { \ +FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { \ return BackIOp::Operation::num_elems_x(thread, opData.backIOp); \ } \ -FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { \ +FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { \ return BackIOp::Operation::num_elems_y(thread, opData.backIOp); \ } \ -FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { \ +FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { \ return BackIOp::Operation::num_elems_z(thread, opData.backIOp); \ } #define BORDER_READER_EXEC \ -FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { \ +FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { \ const int last_col = BackIOp::Operation::num_elems_x(thread, backIOp) - 1; \ const int last_row = BackIOp::Operation::num_elems_y(thread, backIOp) - 1; \ const Point new_thread{idx_col(thread.x, last_col), idx_row(thread.y, last_row), thread.z}; \ @@ -197,17 +197,17 @@ FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& param BackIOp_, ReadAndOutputType, SelfType>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_x(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_y(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_z(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { const int width = BackIOp::Operation::num_elems_x(thread, backIOp); const int height = BackIOp::Operation::num_elems_y(thread, backIOp); if (thread.x >= 0 && thread.x < width && thread.y >= 0 && thread.y < height) { @@ -274,7 +274,7 @@ FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& param struct BorderReader, BackIOp_, std::enable_if_t, void>> { BORDER_READER_DETAILS(BorderType::WRAP) - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { const int width = BackIOp::Operation::num_elems_x(thread, backIOp); const int height = BackIOp::Operation::num_elems_y(thread, backIOp); const Point new_thread{idx_col(thread.x, width), idx_row(thread.y, height), thread.z}; diff --git a/include/fused_kernel/algorithms/image_processing/color_conversion.h b/include/fused_kernel/algorithms/image_processing/color_conversion.h index 7eaaf6cb..3f667278 100644 --- a/include/fused_kernel/algorithms/image_processing/color_conversion.h +++ b/include/fused_kernel/algorithms/image_processing/color_conversion.h @@ -34,7 +34,7 @@ namespace fk { FK_STATIC_STRUCT(StaticAddAlpha, SelfType) using Parent = UnaryOperation, StaticAddAlpha>; DECLARE_UNARY_PARENT - FK_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_DEVICE_FUSE OutputType exec(const InputType input) { return AddLast::exec(input, { alpha }); } }; @@ -52,7 +52,7 @@ namespace fk { FK_STATIC_STRUCT(RGB2Gray, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { // 0.299*R + 0.587*G + 0.114*B if constexpr (std::is_unsigned_v) { #ifdef __CUDA_ARCH__ @@ -71,7 +71,7 @@ namespace fk { } } private: - FK_HOST_DEVICE_FUSE float compute_luminance(const InputType& input) { + FK_HOST_DEVICE_FUSE float compute_luminance(const InputType input) { return (input.x * 0.299f) + (input.y * 0.587f) + (input.z * 0.114f); } }; @@ -110,7 +110,7 @@ namespace fk { FK_STATIC_STRUCT(AddOpaqueAlpha, SelfType) using Parent = UnaryOperation, AddOpaqueAlpha>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { constexpr auto alpha = maxDepthValue; return AddLast::exec(input, { alpha }); } @@ -124,7 +124,7 @@ namespace fk { FK_STATIC_STRUCT(SaturateDepth, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return Saturate::exec(input, { { 0.f, static_cast(maxDepthValue) } }); } }; @@ -192,7 +192,7 @@ namespace fk { FK_STATIC_STRUCT(DenormalizePixel, SelfType) using Parent = UnaryOperation>, O, DenormalizePixel>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { constexpr auto maxDepth = maxDepthValue; return cxp::cast::f(input * maxDepth); } @@ -206,7 +206,7 @@ namespace fk { FK_STATIC_STRUCT(NormalizePixel, SelfType) using Parent = UnaryOperation>, NormalizePixel>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return input / static_cast(maxDepthValue); } }; @@ -219,7 +219,7 @@ namespace fk { FK_STATIC_STRUCT(SaturateDenormalizePixel, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(std::is_same_v, float>, "SaturateDenormalizePixel only works with float base types."); const InputType saturatedFloat = SaturateFloat::exec(input); return DenormalizePixel::exec(saturatedFloat); @@ -235,7 +235,7 @@ namespace fk { using Parent = UnaryOperation>; DECLARE_UNARY_PARENT using Base = typename VectorTraits::base; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(std::is_floating_point_v>, "NormalizeColorRangeDepth only works for floating point values"); // The nvcc compiler will only be able to use the global constexpr floatShiftFactor variable if it is stored in // a local variable. @@ -262,7 +262,7 @@ namespace fk { // Y -> input.x // Cb(U) -> input.y // Cr(V) -> input.z - FK_HOST_DEVICE_FUSE float3 computeRGB(const InputType& pixel) { + FK_HOST_DEVICE_FUSE float3 computeRGB(const InputType pixel) { constexpr M3x3Float coefficients = ccMatrix; constexpr float CSub = subCoefficients.chroma; if constexpr (CP == ColorPrimitives::bt601) { @@ -273,7 +273,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE OutputType computePixel(const InputType& pixel) { + FK_HOST_DEVICE_FUSE OutputType computePixel(const InputType pixel) { const float3 pixelRGBFloat = computeRGB(pixel); if constexpr (std::is_same_v, float>) { if constexpr (ALPHA) { @@ -293,7 +293,7 @@ namespace fk { } public: - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { // Pixel data shifted to the right to it's color depth numerical range constexpr auto shiftFactorLocal = shiftFactor; const InputType shiftedPixel = input >> shiftFactorLocal; @@ -326,7 +326,7 @@ namespace fk { TF::DISABLED, ReadYUV>; DECLARE_READ_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { const auto rawPtr = params.data; if constexpr (PF == PixelFormat::NV12 || PF == PixelFormat::P010 || PF == PixelFormat::P016 || PF == PixelFormat::P210 || @@ -382,15 +382,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -416,7 +416,7 @@ namespace fk { TF::DISABLED, WriteYUV>; DECLARE_WRITE_PARENT - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType& params) { const auto rawPtr = params.data; if constexpr (PF == PixelFormat::NV12 || PF == PixelFormat::P010 || PF == PixelFormat::P016 || PF == PixelFormat::P210 || @@ -494,15 +494,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/algorithms/image_processing/crop.h b/include/fused_kernel/algorithms/image_processing/crop.h index 5f1e66ef..76cf788f 100644 --- a/include/fused_kernel/algorithms/image_processing/crop.h +++ b/include/fused_kernel/algorithms/image_processing/crop.h @@ -33,20 +33,20 @@ namespace fk { typename BackIOp_::Operation::OutputType, Crop>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { const Point newThread{thread.x + static_cast(params.x), thread.y + static_cast(params.y)}; return BackIOp::Operation::exec(newThread, backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -68,15 +68,15 @@ namespace fk { using Parent = IncompleteReadBackOperation>; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/algorithms/image_processing/deinterlace.h b/include/fused_kernel/algorithms/image_processing/deinterlace.h index 3694c17b..fa1bb7a0 100644 --- a/include/fused_kernel/algorithms/image_processing/deinterlace.h +++ b/include/fused_kernel/algorithms/image_processing/deinterlace.h @@ -51,7 +51,7 @@ namespace fk { Deinterlace>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { if constexpr (DType == DeinterlaceType::BLEND) { return execBlend(thread, params, backIOp); } else { // INTER_LINEAR @@ -59,15 +59,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_x(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_y(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -75,7 +75,7 @@ namespace fk { return { num_elems_x(Point{0,0,0}, opData), num_elems_y(Point{0,0,0}, opData), num_elems_z(Point{0,0,0}, opData) }; } private: - FK_HOST_DEVICE_FUSE OutputType execBlend(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType execBlend(const Point thread, const ParamsType& params, const BackIOp& backIOp) { // For blend deinterlacing, we average the current line with adjacent lines using ReadOperation = typename BackIOp::Operation; @@ -90,7 +90,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE OutputType execInterLinearGetPixel(const Point& thread, const BackIOp& backIOp, const bool& interpolate) { + FK_HOST_DEVICE_FUSE OutputType execInterLinearGetPixel(const Point thread, const BackIOp& backIOp, const bool& interpolate) { using ReadOperation = typename BackIOp::Operation; if (interpolate) { // We average the above pixel with the below pixel @@ -102,7 +102,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE OutputType execInterLinear(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType execInterLinear(const Point thread, const ParamsType& params, const BackIOp& backIOp) { using ReadOperation = typename BackIOp::Operation; // Assuming BackFunction::Operation::num_elems_y(Point{0,0,0}, backIOp) is an even number @@ -129,15 +129,15 @@ namespace fk { template using NewInstantiableType = ReadBack>; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/algorithms/image_processing/image.h b/include/fused_kernel/algorithms/image_processing/image.h index f5e11372..90cb1c7b 100644 --- a/include/fused_kernel/algorithms/image_processing/image.h +++ b/include/fused_kernel/algorithms/image_processing/image.h @@ -58,7 +58,7 @@ namespace fk { return ptr(); } - FK_HOST_CNST Image crop(const Point& p, const uint& newWidth, const uint& newHeight) { + FK_HOST_CNST Image crop(const Point p, const uint& newWidth, const uint& newHeight) { const uint newDataWidth = newWidth * PixelFormatTraits::rf.width_f; const uint newDataHeight = newHeight * PixelFormatTraits::rf.height_f; const Point dataPoint{p.x * PixelFormatTraits::rf.width_f, p.y * PixelFormatTraits::rf.height_f, 0}; @@ -87,7 +87,7 @@ namespace fk { #endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED) #endif // defined(NVRTC_COMPILER) - FK_HOST_CNST VectorType_t::cn> readAt(const Point& p) const { + FK_HOST_CNST VectorType_t::cn> readAt(const Point p) const { return ReadYUV::exec(p, ptr()); } }; diff --git a/include/fused_kernel/algorithms/image_processing/interpolation.h b/include/fused_kernel/algorithms/image_processing/interpolation.h index b195fe09..edb76ff3 100644 --- a/include/fused_kernel/algorithms/image_processing/interpolation.h +++ b/include/fused_kernel/algorithms/image_processing/interpolation.h @@ -55,19 +55,19 @@ namespace fk { SelfType>; DECLARE_TERNARY_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_x(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_y(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params, const BackIOp& backIOp) { const float src_x = input.x; const float src_y = input.y; diff --git a/include/fused_kernel/algorithms/image_processing/resize.h b/include/fused_kernel/algorithms/image_processing/resize.h index f76cd047..b760e980 100644 --- a/include/fused_kernel/algorithms/image_processing/resize.h +++ b/include/fused_kernel/algorithms/image_processing/resize.h @@ -27,7 +27,7 @@ namespace fk { FK_STATIC_STRUCT(ComputeResizePoint, ComputeResizePoint) using Parent = BinaryOperation; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType thread, const ParamsType& params) { // This is what makes the interpolation a resize operation const float fx = params.x; const float fy = params.y; @@ -83,7 +83,7 @@ namespace fk { SelfType>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { if constexpr (AR == AspectRatio::IGNORE_AR) { return exec_resize(thread, params, backIOp); } else { @@ -97,15 +97,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -114,7 +114,7 @@ namespace fk { } private: - FK_HOST_DEVICE_FUSE auto exec_resize(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE auto exec_resize(const Point thread, const ParamsType& params, const BackIOp& backIOp) { static_assert(opIs, "BackIOp must be a ternary type for this specialization"); const float src_x = thread.x * params.src_conv_factors.x; @@ -138,13 +138,13 @@ namespace fk { template using NewInstantiableType = ResizeComplete>>; - FK_HOST_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -249,13 +249,13 @@ namespace fk { template using NewInstantiableType = ResizeComplete>>; - FK_HOST_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/algorithms/image_processing/saturate.h b/include/fused_kernel/algorithms/image_processing/saturate.h index 5fa1b0e8..c8097e62 100644 --- a/include/fused_kernel/algorithms/image_processing/saturate.h +++ b/include/fused_kernel/algorithms/image_processing/saturate.h @@ -30,7 +30,7 @@ namespace fk { FK_STATIC_STRUCT(SaturateCast, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::saturate_cast::f(input); } }; @@ -43,7 +43,7 @@ namespace fk { FK_STATIC_STRUCT(Saturate, SelfType) using Parent = BinaryOperation, 2>, T, Saturate>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { static_assert(!validCUDAVec, "Saturate only works with non cuda vector types"); return cxp::max::f(cxp::min::f(input, params.y), params.x); } @@ -57,7 +57,7 @@ namespace fk { FK_STATIC_STRUCT(SaturateFloat, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(std::is_same_v, float>, "Saturate float only works with float base types."); return cxp::max::f(make_set(0.f), cxp::min::f(input, make_set(1.f)));; diff --git a/include/fused_kernel/algorithms/image_processing/warping.h b/include/fused_kernel/algorithms/image_processing/warping.h index 5b206e55..2da8cdf2 100644 --- a/include/fused_kernel/algorithms/image_processing/warping.h +++ b/include/fused_kernel/algorithms/image_processing/warping.h @@ -47,7 +47,7 @@ namespace fk { FK_STATIC_STRUCT(WarpingCoords, SelfType) using Parent = BinaryOperation, float2, WarpingCoords>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType thread, const ParamsType& params) { const int x = thread.x; const int y = thread.y; const auto& transMatRaw = params.transformMatrix.data; @@ -80,7 +80,7 @@ namespace fk { float_>, Warping>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { const float2 coord = WarpingCoords::exec(thread, params); const Size sourceSize(BackIOp::Operation::num_elems_x(thread, backIOp), BackIOp::Operation::num_elems_y(thread, backIOp)); @@ -91,15 +91,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -121,15 +121,15 @@ namespace fk { Warping>; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/core/data/ptr_nd.h b/include/fused_kernel/core/data/ptr_nd.h index bedbe42f..b3a70138 100644 --- a/include/fused_kernel/core/data/ptr_nd.h +++ b/include/fused_kernel/core/data/ptr_nd.h @@ -439,7 +439,7 @@ namespace fk { inline constexpr operator RawPtr() const { return ptr_a; } - inline constexpr Ptr crop(const Point& p, const PtrDims& newDims) { + inline constexpr Ptr crop(const Point p, const PtrDims& newDims) { T* ptr = At::point(p, ptr_a); if (ref) { ref->cnt.fetch_add(1); @@ -564,7 +564,7 @@ namespace fk { inline void download(Stream& stream) {} #endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED) - inline T at(const Point& p) const { + inline T at(const Point p) const { if (type != MemType::Device) { return *At::cr_point(p, ptr_pinned); } else { @@ -649,7 +649,7 @@ namespace fk { inline constexpr Ptr1D(T* data_, const PtrDims& dims_, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) : Ptr(data_, dims_, type_, deviceID_) {} - inline constexpr Ptr1D crop1D(const Point& p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } + inline constexpr Ptr1D crop1D(const Point p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } }; template @@ -666,7 +666,7 @@ namespace fk { inline Ptr2D(T* data_, const uint& width_, const uint& height_, const uint& pitch_, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) : Ptr(data_, PtrDims(width_, height_, pitch_), type_, deviceID_) {} - inline Ptr2D crop2D(const Point& p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } + inline Ptr2D crop2D(const Point p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } inline void Alloc(const fk::Size& size, const uint& pitch_ = 0, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) { this->freePtr(); this->allocPtr(PtrDims(size.width, size.height, pitch_), type_, deviceID_); @@ -685,7 +685,7 @@ namespace fk { inline constexpr Ptr3D(T* data_, const PtrDims& dims_, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) : Ptr(data_, dims_, type_, deviceID_) {} - inline constexpr Ptr3D crop3D(const Point& p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } + inline constexpr Ptr3D crop3D(const Point p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } }; // A color-plane-transposed 3D pointer PtrT3D @@ -700,7 +700,7 @@ namespace fk { inline constexpr PtrT3D(T* data_, const PtrDims& dims_, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) : Ptr(data_, dims_, type_, deviceID_) {} - inline constexpr PtrT3D crop3D(const Point& p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } + inline constexpr PtrT3D crop3D(const Point p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } }; // A Tensor pointer diff --git a/include/fused_kernel/core/data/rawptr.h b/include/fused_kernel/core/data/rawptr.h index cc05159b..254d52c0 100644 --- a/include/fused_kernel/core/data/rawptr.h +++ b/include/fused_kernel/core/data/rawptr.h @@ -189,12 +189,12 @@ namespace fk { template <> struct PtrAccessor { template - FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point p, const RawPtr& ptr) { return ((const BiggerType*)ptr.data) + p.x; } template - FK_HOST_DEVICE_STATIC BiggerType* point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_STATIC BiggerType* point(const Point p, const RawPtr& ptr) { return (BiggerType*)ptr.data + p.x; } }; @@ -202,12 +202,12 @@ namespace fk { template <> struct PtrAccessor { template - FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point p, const RawPtr& ptr) { return (const BiggerType*)((const char*)ptr.data + (p.y * ptr.dims.pitch)) + p.x; } template - FK_HOST_DEVICE_STATIC BiggerType* point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_STATIC BiggerType* point(const Point p, const RawPtr& ptr) { return (BiggerType*)((char*)ptr.data + (p.y * ptr.dims.pitch)) + p.x; } }; @@ -215,12 +215,12 @@ namespace fk { template <> struct PtrAccessor { template - FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point p, const RawPtr& ptr) { return (const BiggerType*)((const char*)ptr.data + (ptr.dims.plane_pitch * ptr.dims.color_planes * p.z) + (p.y * ptr.dims.pitch)) + p.x; } template - FK_HOST_DEVICE_STATIC BiggerType* point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_STATIC BiggerType* point(const Point p, const RawPtr& ptr) { return (BiggerType*)((char*)ptr.data + (ptr.dims.plane_pitch * ptr.dims.color_planes * p.z) + (p.y * ptr.dims.pitch)) + p.x; } }; @@ -228,12 +228,12 @@ namespace fk { template <> struct PtrAccessor { template - FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point& p, const RawPtr& ptr, const uint& color_plane = 0) { + FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point p, const RawPtr& ptr, const uint color_plane = 0) { return (const BiggerType*)((const char*)ptr.data + (color_plane * ptr.dims.color_planes_pitch) + (ptr.dims.plane_pitch * p.z) + (ptr.dims.pitch * p.y)) + p.x; } template - FK_HOST_DEVICE_STATIC BiggerType* point(const Point& p, const RawPtr& ptr, const uint& color_plane = 0) { + FK_HOST_DEVICE_STATIC BiggerType* point(const Point p, const RawPtr& ptr, const uint color_plane = 0) { return (BiggerType*)((char*)ptr.data + (color_plane * ptr.dims.color_planes_pitch) + (ptr.dims.plane_pitch * p.z) + (ptr.dims.pitch * p.y)) + p.x; } }; @@ -244,12 +244,12 @@ namespace fk { template<> struct StaticPtrAccessor { template - FK_HOST_DEVICE_FUSE T read(const Point& p, const StaticRawPtr, T>& ptr) { + FK_HOST_DEVICE_FUSE T read(const Point p, const StaticRawPtr, T>& ptr) { return ptr.data[p.x]; } template - FK_HOST_DEVICE_FUSE void write(const Point& p, StaticRawPtr, T>& ptr, const T& value) { + FK_HOST_DEVICE_FUSE void write(const Point p, StaticRawPtr, T>& ptr, const T value) { ptr.data[p.x] = value; } }; @@ -257,12 +257,12 @@ namespace fk { template<> struct StaticPtrAccessor { template - FK_HOST_DEVICE_FUSE T read(const Point& p, const StaticRawPtr, T>& ptr) { + FK_HOST_DEVICE_FUSE T read(const Point p, const StaticRawPtr, T>& ptr) { return ptr.data[p.y][p.x]; } template - FK_HOST_DEVICE_FUSE void write(const Point& p, StaticRawPtr, T>& ptr, const T& value) { + FK_HOST_DEVICE_FUSE void write(const Point p, StaticRawPtr, T>& ptr, const T value) { ptr.data[p.y][p.x] = value; } }; @@ -270,12 +270,12 @@ namespace fk { template<> struct StaticPtrAccessor { template - FK_HOST_DEVICE_FUSE T read(const Point& p, const StaticRawPtr, T>& ptr) { + FK_HOST_DEVICE_FUSE T read(const Point p, const StaticRawPtr, T>& ptr) { return ptr.data[p.z][p.y][p.x]; } template - FK_HOST_DEVICE_FUSE void write(const Point& p, StaticRawPtr, T>& ptr, const T& value) { + FK_HOST_DEVICE_FUSE void write(const Point p, StaticRawPtr, T>& ptr, const T value) { ptr.data[p.z][p.y][p.x] = value; } }; diff --git a/include/fused_kernel/core/data/rect.h b/include/fused_kernel/core/data/rect.h index 09083c0b..5e361a1c 100644 --- a/include/fused_kernel/core/data/rect.h +++ b/include/fused_kernel/core/data/rect.h @@ -24,8 +24,8 @@ namespace fk { struct Rect_ { P x{ 0 }, y{0}; S width{ 0 }, height{0}; - FK_HOST_DEVICE_CNST Rect_(const Point& point, const Size& size) : x(point.x), y(point.y), width(size.width), height(size.height) {} - FK_HOST_DEVICE_CNST Rect_(const P& x_, const P& y_, const S& width_, const S& height_) : x(x_), y(y_), width(width_), height(height_) {} + FK_HOST_DEVICE_CNST Rect_(const Point point, const Size size) : x(point.x), y(point.y), width(size.width), height(size.height) {} + FK_HOST_DEVICE_CNST Rect_(const P x_, const P y_, const S width_, const S height_) : x(x_), y(y_), width(width_), height(height_) {} FK_HOST_DEVICE_CNST Rect_(){} }; diff --git a/include/fused_kernel/core/data/tuple.h b/include/fused_kernel/core/data/tuple.h index 0fe35484..53df2b9b 100644 --- a/include/fused_kernel/core/data/tuple.h +++ b/include/fused_kernel/core/data/tuple.h @@ -251,24 +251,13 @@ namespace fk { } }; - template - FK_HOST_DEVICE_CNST const auto& get(const Tuple& tuple) { - return TupleUtil::get(tuple); - } - - template - FK_HOST_DEVICE_CNST auto& get(Tuple& tuple) { - return TupleUtil::get(tuple); - } - - template - FK_HOST_DEVICE_CNST auto&& get(Tuple&& tuple) { - return TupleUtil::get(std::forward>(tuple)); - } + template + FK_HOST_DEVICE_CNST decltype(auto) get(TupleLike&& tuple) { + static_assert(isTuple_v, "fk::get can only be used with fk::Tuple"); - template - FK_HOST_DEVICE_CNST const auto&& get(const Tuple&& tuple) { - return TupleUtil::get(std::move(tuple)); + // decltype(auto) + std::forward preserves EXACTLY what TupleUtil returned + // (Value category, const-ness, and reference type) + return TupleUtil::get(std::forward(tuple)); } template diff --git a/include/fused_kernel/core/execution_model/data_parallel_patterns.h b/include/fused_kernel/core/execution_model/data_parallel_patterns.h index d80122ec..86bba805 100644 --- a/include/fused_kernel/core/execution_model/data_parallel_patterns.h +++ b/include/fused_kernel/core/execution_model/data_parallel_patterns.h @@ -72,24 +72,24 @@ namespace fk { // namespace FusedKernel private: using Details = DPPDetails; - template - FK_HOST_DEVICE_FUSE auto operate(const Point& thread, const T& i_data, const IOpTypes&... iOpInstances) { + template + FK_HOST_DEVICE_FUSE auto operate(const Point thread, const InputType i_data, const IOpTypes&... iOpInstances) { return (InputFoldType(thread, i_data) | ... | iOpInstances).input; } template - FK_HOST_DEVICE_FUSE auto operate_idx(const Point& thread, const InputType& input, const IOpTypes&... instantiableOperationInstances) { + FK_HOST_DEVICE_FUSE auto operate_idx(const Point thread, const InputType input, const IOpTypes&... instantiableOperationInstances) { return operate(thread, TFI::template get(input), instantiableOperationInstances...); } template - FK_HOST_DEVICE_FUSE auto operate_thread_fusion_impl(std::integer_sequence idx, const Point& thread, - const InputType& input, const IOpTypes&... instantiableOperationInstances) { + FK_HOST_DEVICE_FUSE auto operate_thread_fusion_impl(std::integer_sequence idx, const Point thread, + const InputType input, const IOpTypes&... instantiableOperationInstances) { return TFI::make(operate_idx(thread, input, instantiableOperationInstances...)...); } template - FK_HOST_DEVICE_FUSE auto operate_thread_fusion(const Point& thread, const InputType& input, const IOpTypes&... instantiableOperationInstances) { + FK_HOST_DEVICE_FUSE auto operate_thread_fusion(const Point thread, const InputType input, const IOpTypes&... instantiableOperationInstances) { if constexpr (TFI::elems_per_thread == 1) { return operate(thread, input, instantiableOperationInstances...); } else { @@ -98,7 +98,7 @@ namespace fk { // namespace FusedKernel } // We pass TFI as a template parameter because sometimes we need to disable the TF template - FK_HOST_DEVICE_FUSE auto read(const Point& thread, const ReadIOp& readDF) { + FK_HOST_DEVICE_FUSE auto read(const Point thread, const ReadIOp& readDF) { if constexpr (TFI::ENABLED) { static_assert(isAnyReadType, "ReadIOp is not ReadType or ReadBackType"); return ReadIOp::Operation::template exec(thread, readDF); @@ -109,7 +109,7 @@ namespace fk { // namespace FusedKernel template FK_HOST_DEVICE_FUSE - void execute_instantiable_operations_helper(const Point& thread, const ReadIOp& readDF, + void execute_instantiable_operations_helper(const Point thread, const ReadIOp& readDF, const IOps&... iOps) { using ReadOperation = typename ReadIOp::Operation; using WriteOperation = typename LastType_t::Operation; @@ -136,12 +136,12 @@ namespace fk { // namespace FusedKernel } template - FK_HOST_DEVICE_FUSE void execute_instantiable_operations(const Point& thread, const IOps&... iOps) { + FK_HOST_DEVICE_FUSE void execute_instantiable_operations(const Point thread, const IOps&... iOps) { execute_instantiable_operations_helper(thread, iOps...); } template - FK_HOST_DEVICE_FUSE void execute_thread(const Point& thread, const ActiveThreads& activeThreads, const IOps&... iOps) { + FK_HOST_DEVICE_FUSE void execute_thread(const Point thread, const ActiveThreads& activeThreads, const IOps&... iOps) { using TFI = typename Details::TFI; if constexpr (!TFI::ENABLED) { execute_instantiable_operations(thread, iOps...); @@ -201,27 +201,6 @@ namespace fk { // namespace FusedKernel return Details{}; } } - template - FK_DEVICE_FUSE auto build_details(const ActiveThreads& activeThreads, const uint& readRow, const uint& writeRow) { - using Details = TransformDPPDetails(TFEN), FirstIOp, IOps...>; - using TFI = typename Details::TFI; - if constexpr (TFI::ENABLED) { - const ActiveThreads gridActiveThreads(static_cast(ceil(activeThreads.x / static_cast(TFI::elems_per_thread))), - activeThreads.y, activeThreads.z); - bool threadDivisible; - if constexpr (TFI::ENABLED) { - using ReadOperation = typename FirstIOp::Operation; - using WriteOperation = typename LastType_t::Operation; - threadDivisible = (readRow % TFI::elems_per_thread == 0) && (writeRow % TFI::elems_per_thread == 0); - } else { - threadDivisible = true; - } - const Details details{ gridActiveThreads, threadDivisible }; - return details; - } else { - return Details{}; - } - } }; // Note: there are no ParArch::GPU_NVIDIA_JIT DPP implementaitons, because @@ -310,8 +289,9 @@ namespace fk { // namespace FusedKernel } template - FK_HOST_DEVICE_FUSE void divergent_operate(const uint& z, const InstantiableOperationSequence& iOpSequence, - const IOpSequenceTypes&... iOpSequences) { + FK_HOST_DEVICE_FUSE void divergent_operate(const uint z, + const InstantiableOperationSequence& iOpSequence, + const IOpSequenceTypes&... iOpSequences) { if (OpSequenceNumber == SequenceSelector::at(z)) { apply(launchTransformDPP, iOpSequence.iOps); } else if constexpr (sizeof...(iOpSequences) > 0) { diff --git a/include/fused_kernel/core/execution_model/operation_model/batch_operations.h b/include/fused_kernel/core/execution_model/operation_model/batch_operations.h index 8423fbba..e6ff8cb8 100644 --- a/include/fused_kernel/core/execution_model/operation_model/batch_operations.h +++ b/include/fused_kernel/core/execution_model/operation_model/batch_operations.h @@ -51,8 +51,8 @@ ReadBackOperation>: ReadBackOperation: Instantiable - Implements exec(const Point& thread, const OperationData>& opData) - Implements exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) + Implements exec(const Point thread, const OperationData>& opData) + Implements exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) Implements build(const OperationData>& opData) Implements build(const ParamsType& params, const BackIOp& backIOp) @@ -156,8 +156,8 @@ namespace fk { BatchRead: Instantiable ParamsType = BatchReadParams - Implements exec(const Point& thread, const OperationDataType& opData) - Implements exec(const Point& thread, const ParamsType& params) + Implements exec(const Point thread, const OperationDataType& opData) + Implements exec(const Point thread, const ParamsType& params) Implements build(const OperationDataType& opData) Implements build(const ParamsType& params) */ @@ -182,16 +182,16 @@ namespace fk { static constexpr bool IS_FUSED_OP = Operation::IS_FUSED_OP; static constexpr bool THREAD_FUSION = Operation::THREAD_FUSION; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -199,11 +199,11 @@ namespace fk { } template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const OperationDataType& opData) { return exec(thread, opData.params); } template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const ParamsType& params) { if constexpr (THREAD_FUSION) { return Operation::template exec(thread, params.opData[thread.z]); } else { @@ -239,16 +239,16 @@ namespace fk { static constexpr bool IS_FUSED_OP = Operation::IS_FUSED_OP; static constexpr bool THREAD_FUSION = false; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -256,11 +256,11 @@ namespace fk { } template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const OperationDataType& opData) { return exec(thread, opData.params); } template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const ParamsType& params) { if (params.usedPlanes <= thread.z) { return params.default_value; } else { @@ -304,16 +304,16 @@ namespace fk { static constexpr bool IS_FUSED_OP = Operation::IS_FUSED_OP; static constexpr bool THREAD_FUSION = Operation::THREAD_FUSION; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -352,16 +352,16 @@ namespace fk { static constexpr bool IS_FUSED_OP = Operation::IS_FUSED_OP; static constexpr bool THREAD_FUSION = false; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -472,8 +472,8 @@ namespace fk { DECLARE_WRITE_PARENT_BASIC template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, - const ThreadFusionType& input, + FK_HOST_DEVICE_FUSE void exec(const Point thread, + const ThreadFusionType input, const ParamsType& params) { if constexpr (THREAD_FUSION) { Operation::template exec(thread, input, params[thread.z]); @@ -481,10 +481,10 @@ namespace fk { Operation::exec(thread, input, params[thread.z]); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params[thread.z]); } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params[thread.z]); } // Build WriteBatch from array of IOps diff --git a/include/fused_kernel/core/execution_model/operation_model/fused_operation.h b/include/fused_kernel/core/execution_model/operation_model/fused_operation.h index 1394f947..7bbcda13 100644 --- a/include/fused_kernel/core/execution_model/operation_model/fused_operation.h +++ b/include/fused_kernel/core/execution_model/operation_model/fused_operation.h @@ -55,14 +55,14 @@ namespace fk { DECLARE_OPEN_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const InputType input, const ParamsType& params) { return exec_helper(std::make_index_sequence{}, thread, input, params); } private: template FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, - const Point& thread, - const InputType& input, + const Point thread, + const InputType input, const ParamsType& params) { return (InputFoldType{thread, input} | ... | get_opt(params)).input; } @@ -88,21 +88,21 @@ namespace fk { DECLARE_READ_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { return exec_helper(std::make_index_sequence{}, thread, params); } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return FirstType_t::Operation::num_elems_x(thread, get_opt<0>(opData.params)); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return FirstType_t::Operation::num_elems_y(thread, get_opt<0>(opData.params)); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return FirstType_t::Operation::num_elems_z(thread, get_opt<0>(opData.params)); } @@ -114,7 +114,7 @@ namespace fk { private: template FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, - const Point& thread, + const Point thread, const ParamsType& params) { return (thread | ... | get_opt(params)).input; } @@ -131,13 +131,13 @@ namespace fk { FK_STATIC_STRUCT(FusedOperation_, SelfType) DECLARE_UNARY_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType &input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return exec_helper(std::make_index_sequence::size>{}, input); } private: template - FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, const InputType input) { constexpr OperationTuple poTup{}; // Optimization, we use a version of operator| that does not use InputTypeFold, // thus it does not propagate Point thread, because it is not needed. @@ -158,13 +158,13 @@ namespace fk { FK_STATIC_STRUCT(FusedOperation_, SelfType) DECLARE_CLOSED_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE void exec(const Point &thread, const ParamsType ¶ms) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const ParamsType ¶ms) { exec_helper(std::make_index_sequence{}, thread, params); } private: template - FK_HOST_DEVICE_FUSE void exec_helper(const std::index_sequence&, const Point &thread, + FK_HOST_DEVICE_FUSE void exec_helper(const std::index_sequence&, const Point thread, const ParamsType ¶ms) { LastType_t::Operation::exec(thread, (thread | ... | get_opt(params)).input, @@ -189,14 +189,14 @@ namespace fk { FK_STATIC_STRUCT(FusedOperation_, SelfType) DECLARE_WRITE_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE void exec(const Point &thread, const InputType& input, const ParamsType ¶ms) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType ¶ms) { exec_helper(std::make_index_sequence{}, thread, input, params); } private: template - FK_HOST_DEVICE_FUSE void exec_helper(const std::index_sequence &, const Point &thread, - const InputType& input, const ParamsType ¶ms) { + FK_HOST_DEVICE_FUSE void exec_helper(const std::index_sequence &, const Point thread, + const InputType input, const ParamsType ¶ms) { LastType_t::Operation::exec( thread, (InputFoldType<>::build(thread, input) | ... | get_opt(params)).input, get_opt(params)); @@ -217,14 +217,14 @@ namespace fk { FK_STATIC_STRUCT(FusedOperation_, SelfType) DECLARE_BINARY_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType &input, const ParamsType ¶ms) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType ¶ms) { return exec_helper(std::make_index_sequence{}, input, params); } private: template FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, - const InputType& input, const ParamsType& params) { + const InputType input, const ParamsType& params) { // Optimization, we use a version of operator| that does not use InputTypeFold, // thus it does not propagate Point thread, because it is not needed. return (input | ... | get_opt(params)); diff --git a/include/fused_kernel/core/execution_model/operation_model/instantiable_operations.h b/include/fused_kernel/core/execution_model/operation_model/instantiable_operations.h index afe55b3d..d821603a 100644 --- a/include/fused_kernel/core/execution_model/operation_model/instantiable_operations.h +++ b/include/fused_kernel/core/execution_model/operation_model/instantiable_operations.h @@ -64,14 +64,14 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c Point thread; InputType input; - FK_HOST_DEVICE_CNST InputFoldType(const Point& thread_, const InputType& input_) + FK_HOST_DEVICE_CNST InputFoldType(const Point thread_, const InputType input_) : thread(thread_), input(input_) {} }; template <> struct InputFoldType { template - FK_HOST_DEVICE_FUSE auto build(const Point& thread, InputT&& input) { + FK_HOST_DEVICE_FUSE auto build(const Point thread, InputT&& input) { return InputFoldType>(thread, std::forward(input)); } }; @@ -84,7 +84,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c return Operation::getActiveThreads(*this); } - FK_HOST_DEVICE_CNST friend auto operator|(const Point& thread, const OperationData& opData) { + FK_HOST_DEVICE_CNST friend auto operator|(const Point thread, const OperationData& opData) { return InputFoldType<>::build(thread, Operation::exec(thread, opData)); } @@ -107,7 +107,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c return Fuser_t::fuse(std::forward(prevIOp), self); } - FK_HOST_DEVICE_CNST friend auto operator|(const Point& thread, const OperationData& opData) { + FK_HOST_DEVICE_CNST friend auto operator|(const Point thread, const OperationData& opData) { return InputFoldType<>::build(thread, Operation::exec(thread, opData)); } }; @@ -134,7 +134,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c * It can be composed of a single Operation or of a chain of Operations, in which case it wraps them into an * FusedOperation. * Expects Operation_t to have an static __device__ function member with the following parameters: - * OutputType exec(const InputType& input, const OperationData& opDat) + * OutputType exec(const InputType input, const OperationData& opDat) */ template struct BinaryInstantiableOperation final : public OperationData { @@ -164,7 +164,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c * Third parameter (back_function): it's a IOp that will be used at some point in the implementation of the * Operation. It can be any kind of IOp. * Expects Operation_t to have an static __device__ function member with the following parameters: - * OutputType exec(const InputType& input, const OperationData& opData) + * OutputType exec(const InputType input, const OperationData& opData) */ template struct TernaryInstantiableOperation final : public OperationData { @@ -194,7 +194,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c * It allows to execute the Operation (or chain of Unary Operations) on the input, and returns the result as output * in register memory. * Expects Operation_t to have an static __device__ function member with the following parameters: - * OutputType exec(const InputType& input) + * OutputType exec(const InputType input) */ template struct UnaryInstantiableOperation { @@ -275,7 +275,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c struct ClosedInstantiableOperation final : public OperationData { INSTANTIABLE_OPERATION_DETAILS_IS_ASSERT(ClosedType) - FK_HOST_DEVICE_CNST friend void operator|(const Point& thread, const OperationData& opData) { + FK_HOST_DEVICE_CNST friend void operator|(const Point thread, const OperationData& opData) { Operation::exec(thread, opData); } }; diff --git a/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h b/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h index e74f3cd3..a2be6f16 100644 --- a/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h +++ b/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h @@ -59,6 +59,17 @@ namespace fk { template using OperationTuple = OperationTuple_; + // Primary template: defaults to false + template + struct IsOpTuple : std::false_type {}; + + // Partial specialization: matches any specialization of Tuple + template + struct IsOpTuple> : std::true_type {}; + + template + constexpr bool isOpTuple_v = IsOpTuple>::value; + template struct GetIndex; @@ -76,17 +87,21 @@ namespace fk { template struct GetIndex, IdxValue> { - static constexpr size_t value = GetIndexHelper, decltype(std::make_index_sequence()), IdxValue>::value; + static constexpr size_t value = GetIndexHelper, + decltype(std::make_index_sequence()), IdxValue>::value; }; - // As observed in get<>(Tuple<...>), returning a const& as auto, - // may lead to local memory accesses in the GPU - template - FK_HOST_DEVICE_CNST decltype(auto) get_opt(const OperationTuple& opTuple){ - if constexpr (opIs>>) { - return typename TypeAt_t>::Operation::InstantiableType{}; + template + FK_HOST_DEVICE_CNST decltype(auto) get_opt(OpTuple&& opTuple) { + static_assert(isOpTuple_v>, "get_opt only works with OperationTuple."); + if constexpr (opIs::Operations>>) { + // Unary types are stateless, return a new temporary by value. + // decltype(auto) deduces this as 'T' (Value). + return typename TypeAt_t::Operations>::Operation::InstantiableType{}; } else { - return get::Indexes, Idx>::value>(opTuple.instances); + // Stored types return whatever is stored in the OpTuple + return get::Indexes, Idx>::value>( + std::forward(opTuple).instances); } } diff --git a/include/fused_kernel/core/execution_model/operation_model/parent_operations.h b/include/fused_kernel/core/execution_model/operation_model/parent_operations.h index 9882c0c7..c5c814f7 100644 --- a/include/fused_kernel/core/execution_model/operation_model/parent_operations.h +++ b/include/fused_kernel/core/execution_model/operation_model/parent_operations.h @@ -78,7 +78,7 @@ namespace fk { using OperationDataType = typename Parent::OperationDataType; \ using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ - FK_HOST_DEVICE_FUSE OutputType exec(const InputType &input, const OperationDataType &opData) { \ + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const OperationDataType &opData) { \ return exec(input, opData.params); \ } \ FK_HOST_FUSE InstantiableType build(const OperationDataType& opData) { return {opData}; } \ @@ -110,7 +110,7 @@ namespace fk { using OperationDataType = typename Parent::OperationDataType; \ using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ - FK_HOST_DEVICE_FUSE OutputType exec(const InputType &input, const OperationDataType &opData) { \ + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const OperationDataType &opData) { \ return exec(input, opData.params, opData.backIOp); \ } \ FK_HOST_FUSE InstantiableType build(const OperationDataType &opData) { return {opData}; } \ @@ -146,7 +146,7 @@ namespace fk { static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ static constexpr bool THREAD_FUSION = Parent::THREAD_FUSION; \ template \ - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const OperationDataType& opData) { \ + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const OperationDataType& opData) { \ if constexpr (std::bool_constant::value) { \ return exec(thread, opData.params); \ } else { \ @@ -186,7 +186,7 @@ namespace fk { static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ static constexpr bool THREAD_FUSION = Parent::THREAD_FUSION; \ template \ - FK_HOST_DEVICE_FUSE void exec(const Point &thread, \ + FK_HOST_DEVICE_FUSE void exec(const Point thread, \ const ThreadFusionType &input, \ const OperationDataType &opData) { \ if constexpr (THREAD_FUSION) { \ @@ -222,8 +222,8 @@ namespace fk { using OperationDataType = typename Parent::OperationDataType; \ using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, \ - const InputType& input, \ + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, \ + const InputType input, \ const OperationDataType& opData) { \ return exec(thread, input, opData.params); \ } \ @@ -251,7 +251,7 @@ struct ClosedOperation { using OperationDataType = typename Parent::OperationDataType; \ using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ - FK_HOST_DEVICE_FUSE void exec(const Point &thread, const OperationDataType &opData) { \ + FK_HOST_DEVICE_FUSE void exec(const Point thread, const OperationDataType &opData) { \ exec(thread, opData.params); \ } \ FK_HOST_FUSE InstantiableType build(const OperationDataType &opData) { return { opData }; } \ @@ -285,7 +285,7 @@ struct ClosedOperation { using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ static constexpr bool THREAD_FUSION = Parent::THREAD_FUSION; \ - FK_HOST_DEVICE_FUSE OutputType exec(const Point &thread, const OperationDataType &opData) { \ + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const OperationDataType &opData) { \ return exec(thread, opData.params, opData.backIOp); \ } \ FK_HOST_FUSE InstantiableType build(const OperationDataType &opData) { return {opData}; } \ @@ -328,22 +328,22 @@ struct ClosedOperation { struct NumElems { template - FK_HOST_DEVICE_FUSE uint x(const Point& thread, const IOp& iOp) { + FK_HOST_DEVICE_FUSE uint x(const Point thread, const IOp& iOp) { static_assert(isAnyReadType || opIs, "Only Read, ReadBack, IncompleteReadBack and Ternary Types work with NumElems::x"); return IOp::Operation::num_elems_x(thread, iOp); } template - FK_HOST_DEVICE_FUSE uint y(const Point& thread, const IOp& iOp) { + FK_HOST_DEVICE_FUSE uint y(const Point thread, const IOp& iOp) { static_assert(isAnyReadType || opIs, "Only Read, ReadBack, IncompleteReadBack and Ternary Types work with NumElems::y"); return IOp::Operation::num_elems_y(thread, iOp); } template - FK_HOST_DEVICE_FUSE Size size(const Point& thread, const IOp& iOp) { + FK_HOST_DEVICE_FUSE Size size(const Point thread, const IOp& iOp) { static_assert(isAnyReadType || opIs, "Only Read, ReadBack, IncompleteReadBack and Ternary Types work with NumElems::size"); return Size(x(thread, iOp), y(thread, iOp)); } template - FK_HOST_DEVICE_FUSE uint z(const Point& thread, const IOp& iOp) { + FK_HOST_DEVICE_FUSE uint z(const Point thread, const IOp& iOp) { static_assert(isAnyReadType || opIs, "Only Read, ReadBack, IncompleteReadBack and Ternary Types work with NumElems::z"); return IOp::Operation::num_elems_z(thread, iOp); } diff --git a/include/fused_kernel/core/execution_model/operation_model/vector_operations.h b/include/fused_kernel/core/execution_model/operation_model/vector_operations.h index c223fbe5..228c9b85 100644 --- a/include/fused_kernel/core/execution_model/operation_model/vector_operations.h +++ b/include/fused_kernel/core/execution_model/operation_model/vector_operations.h @@ -32,7 +32,7 @@ namespace fk { using InputType = I; using OutputType = O; using InstanceType = UnaryType; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(cn == cn, "Unary struct requires same number of channels for input and output types."); constexpr bool allCUDAOrNotCUDA = @@ -73,7 +73,7 @@ namespace fk { using InputType = I; using OutputType = O; using InstanceType = UnaryType; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { const auto input1 = get<0>(input); const auto input2 = get<1>(input); using I1 = get_t<0, I>; @@ -138,10 +138,10 @@ namespace fk { using ParamsType = P; using InstanceType = BinaryType; using OperationDataType = OperationData>; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const OperationDataType& opData) { return BinaryV::exec(input, opData.params); } - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { static_assert(cn == cn, "Binary struct requires same number of channels for input and output types."); constexpr bool allCUDAOrNotCUDA = diff --git a/tests/examples/inlining_and_LDL_STL.h b/tests/examples/inlining_and_LDL_STL.h index a465ca56..184badb7 100644 --- a/tests/examples/inlining_and_LDL_STL.h +++ b/tests/examples/inlining_and_LDL_STL.h @@ -57,7 +57,7 @@ struct SimpleTransformDPPBaseReference { friend struct SimpleTransformDPPReference; // Allow TransformDPP to access private members private: template - FK_HOST_DEVICE_FUSE void execute_thread(const Point& thread, const ReadIOp& readDF, const IOps&... iOps) { + FK_HOST_DEVICE_FUSE void execute_thread(const Point thread, const ReadIOp& readDF, const IOps&... iOps) { using ReadOperation = typename ReadIOp::Operation; using WriteOperation = typename LastType_t::Operation; @@ -82,7 +82,7 @@ struct SimpleTransformDPPBaseReferenceFoldExpr { // to access private members private: template - FK_HOST_DEVICE_FUSE void execute_thread(const Point &thread, const ReadIOp &readDF, const IOps &...iOps) { + FK_HOST_DEVICE_FUSE void execute_thread(const Point thread, const ReadIOp &readDF, const IOps &...iOps) { using ReadOperation = typename ReadIOp::Operation; using WriteOperation = typename LastType_t::Operation; diff --git a/utests/algorithm/image_processing/utest_saturate/utest_saturate_common.h b/utests/algorithm/image_processing/utest_saturate/utest_saturate_common.h index 58bebef8..ccd3e8d5 100644 --- a/utests/algorithm/image_processing/utest_saturate/utest_saturate_common.h +++ b/utests/algorithm/image_processing/utest_saturate/utest_saturate_common.h @@ -24,7 +24,7 @@ constexpr T halfPositiveRange() { } template -constexpr OutputType expectedPositiveValue(const InputType &input) { +constexpr OutputType expectedPositiveValue(const InputType input) { if (cxp::cmp_greater::f(input, fk::maxValue>)) { return fk::maxValue; } else {