From fd8274dc9551b87a2f1b0535dc8eb8602f79bc9c Mon Sep 17 00:00:00 2001 From: Oscar Amoros Huguet Date: Wed, 18 Feb 2026 12:05:00 +0100 Subject: [PATCH 1/3] Making get_opt generic like get --- .../operation_model/operation_tuple.h | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h b/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h index e74f3cd3..408910b1 100644 --- a/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h +++ b/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h @@ -79,17 +79,54 @@ namespace fk { static constexpr size_t value = GetIndexHelper, decltype(std::make_index_sequence()), IdxValue>::value; }; - // As observed in get<>(Tuple<...>), returning a const& as auto, - // may lead to local memory accesses in the GPU + // 1. Mutable L-value (Allows modification: get_opt<0>(tup) = x;) template - FK_HOST_DEVICE_CNST decltype(auto) get_opt(const OperationTuple& opTuple){ + FK_HOST_DEVICE_CNST decltype(auto) get_opt(OperationTuple& opTuple) { if constexpr (opIs>>) { + // Unary types are stateless, return a new temporary by value. + // decltype(auto) deduces this as 'T' (Value). return typename TypeAt_t>::Operation::InstantiableType{}; } else { + // Stored types return a reference to the data. + // get(...) returns 'auto&', so decltype(auto) returns 'T&'. return get::Indexes, Idx>::value>(opTuple.instances); } } + // 2. Const L-value (Read-only access: safe for const objects) + template + FK_HOST_DEVICE_CNST decltype(auto) get_opt(const OperationTuple& opTuple) { + if constexpr (opIs>>) { + return typename TypeAt_t>::Operation::InstantiableType{}; + } else { + // get(...) returns 'const auto&', so decltype(auto) returns 'const T&'. + return get::Indexes, Idx>::value>(opTuple.instances); + } + } + + // 3. Mutable R-value (Move semantics: allows stealing resources) + template + FK_HOST_DEVICE_CNST decltype(auto) get_opt(OperationTuple&& opTuple) { + if constexpr (opIs>>) { + return typename TypeAt_t>::Operation::InstantiableType{}; + } else { + // We must std::move the internal tuple member to propagate the r-value nature. + // get(...) returns 'auto&&', so decltype(auto) returns 'T&&'. + return get::Indexes, Idx>::value>(std::move(opTuple.instances)); + } + } + + // 4. Const R-value (Rare, but technically correct for completeness) + template + FK_HOST_DEVICE_CNST decltype(auto) get_opt(const OperationTuple&& opTuple) { + if constexpr (opIs>>) { + return typename TypeAt_t>::Operation::InstantiableType{}; + } else { + // get(...) returns 'const auto&&'. + return get::Indexes, Idx>::value>(std::move(opTuple.instances)); + } + } + template FK_HOST_CNST auto make_new_operation_tuple_helper(const std::index_sequence&, const IOps&... iOps) { // 1. Pack arguments into a tuple ONCE. From 952eff1769eacc045bdbe17cac29ac9a09e331d4 Mon Sep 17 00:00:00 2001 From: Oscar Amoros Huguet Date: Wed, 18 Feb 2026 12:36:19 +0100 Subject: [PATCH 2/3] Simplified get wrapper and get_opt implementations --- include/fused_kernel/core/data/tuple.h | 23 ++----- .../operation_model/operation_tuple.h | 64 ++++++------------- 2 files changed, 27 insertions(+), 60 deletions(-) diff --git a/include/fused_kernel/core/data/tuple.h b/include/fused_kernel/core/data/tuple.h index 0fe35484..53df2b9b 100644 --- a/include/fused_kernel/core/data/tuple.h +++ b/include/fused_kernel/core/data/tuple.h @@ -251,24 +251,13 @@ namespace fk { } }; - template - FK_HOST_DEVICE_CNST const auto& get(const Tuple& tuple) { - return TupleUtil::get(tuple); - } - - template - FK_HOST_DEVICE_CNST auto& get(Tuple& tuple) { - return TupleUtil::get(tuple); - } - - template - FK_HOST_DEVICE_CNST auto&& get(Tuple&& tuple) { - return TupleUtil::get(std::forward>(tuple)); - } + template + FK_HOST_DEVICE_CNST decltype(auto) get(TupleLike&& tuple) { + static_assert(isTuple_v, "fk::get can only be used with fk::Tuple"); - template - FK_HOST_DEVICE_CNST const auto&& get(const Tuple&& tuple) { - return TupleUtil::get(std::move(tuple)); + // decltype(auto) + std::forward preserves EXACTLY what TupleUtil returned + // (Value category, const-ness, and reference type) + return TupleUtil::get(std::forward(tuple)); } template diff --git a/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h b/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h index 408910b1..a2be6f16 100644 --- a/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h +++ b/include/fused_kernel/core/execution_model/operation_model/operation_tuple.h @@ -59,6 +59,17 @@ namespace fk { template using OperationTuple = OperationTuple_; + // Primary template: defaults to false + template + struct IsOpTuple : std::false_type {}; + + // Partial specialization: matches any specialization of Tuple + template + struct IsOpTuple> : std::true_type {}; + + template + constexpr bool isOpTuple_v = IsOpTuple>::value; + template struct GetIndex; @@ -76,54 +87,21 @@ namespace fk { template struct GetIndex, IdxValue> { - static constexpr size_t value = GetIndexHelper, decltype(std::make_index_sequence()), IdxValue>::value; + static constexpr size_t value = GetIndexHelper, + decltype(std::make_index_sequence()), IdxValue>::value; }; - // 1. Mutable L-value (Allows modification: get_opt<0>(tup) = x;) - template - FK_HOST_DEVICE_CNST decltype(auto) get_opt(OperationTuple& opTuple) { - if constexpr (opIs>>) { + template + FK_HOST_DEVICE_CNST decltype(auto) get_opt(OpTuple&& opTuple) { + static_assert(isOpTuple_v>, "get_opt only works with OperationTuple."); + if constexpr (opIs::Operations>>) { // Unary types are stateless, return a new temporary by value. // decltype(auto) deduces this as 'T' (Value). - return typename TypeAt_t>::Operation::InstantiableType{}; - } else { - // Stored types return a reference to the data. - // get(...) returns 'auto&', so decltype(auto) returns 'T&'. - return get::Indexes, Idx>::value>(opTuple.instances); - } - } - - // 2. Const L-value (Read-only access: safe for const objects) - template - FK_HOST_DEVICE_CNST decltype(auto) get_opt(const OperationTuple& opTuple) { - if constexpr (opIs>>) { - return typename TypeAt_t>::Operation::InstantiableType{}; - } else { - // get(...) returns 'const auto&', so decltype(auto) returns 'const T&'. - return get::Indexes, Idx>::value>(opTuple.instances); - } - } - - // 3. Mutable R-value (Move semantics: allows stealing resources) - template - FK_HOST_DEVICE_CNST decltype(auto) get_opt(OperationTuple&& opTuple) { - if constexpr (opIs>>) { - return typename TypeAt_t>::Operation::InstantiableType{}; - } else { - // We must std::move the internal tuple member to propagate the r-value nature. - // get(...) returns 'auto&&', so decltype(auto) returns 'T&&'. - return get::Indexes, Idx>::value>(std::move(opTuple.instances)); - } - } - - // 4. Const R-value (Rare, but technically correct for completeness) - template - FK_HOST_DEVICE_CNST decltype(auto) get_opt(const OperationTuple&& opTuple) { - if constexpr (opIs>>) { - return typename TypeAt_t>::Operation::InstantiableType{}; + return typename TypeAt_t::Operations>::Operation::InstantiableType{}; } else { - // get(...) returns 'const auto&&'. - return get::Indexes, Idx>::value>(std::move(opTuple.instances)); + // Stored types return whatever is stored in the OpTuple + return get::Indexes, Idx>::value>( + std::forward(opTuple).instances); } } From e536e14c3caa8022b55b18408344966e0bbfd7f6 Mon Sep 17 00:00:00 2001 From: Oscar Amoros Huguet Date: Wed, 18 Feb 2026 16:51:22 +0100 Subject: [PATCH 3/3] Made all values that reside in registers, to be passed as const value, instead of const reference, to be more consistent with the reality of the variables and values. If nvcc where very strict, or there where no inlining in one function, it would trigger local memory usage. --- .../algorithms/basic_ops/algebraic.h | 4 +- .../algorithms/basic_ops/arithmetic.h | 10 +- .../fused_kernel/algorithms/basic_ops/cast.h | 2 +- .../algorithms/basic_ops/logical.h | 18 +-- .../algorithms/basic_ops/memory_operations.h | 108 +++++++++--------- .../fused_kernel/algorithms/basic_ops/set.h | 8 +- .../algorithms/basic_ops/static_loop.h | 4 +- .../algorithms/basic_ops/vector_ops.h | 12 +- .../image_processing/border_reader.h | 36 +++--- .../image_processing/color_conversion.h | 40 +++---- .../algorithms/image_processing/crop.h | 14 +-- .../algorithms/image_processing/deinterlace.h | 20 ++-- .../algorithms/image_processing/image.h | 4 +- .../image_processing/interpolation.h | 8 +- .../algorithms/image_processing/resize.h | 24 ++-- .../algorithms/image_processing/saturate.h | 6 +- .../algorithms/image_processing/warping.h | 16 +-- include/fused_kernel/core/data/ptr_nd.h | 12 +- include/fused_kernel/core/data/rawptr.h | 28 ++--- include/fused_kernel/core/data/rect.h | 4 +- .../execution_model/data_parallel_patterns.h | 46 +++----- .../operation_model/batch_operations.h | 56 ++++----- .../operation_model/fused_operation.h | 34 +++--- .../operation_model/instantiable_operations.h | 16 +-- .../operation_model/parent_operations.h | 24 ++-- .../operation_model/vector_operations.h | 8 +- tests/examples/inlining_and_LDL_STL.h | 4 +- .../utest_saturate/utest_saturate_common.h | 2 +- 28 files changed, 274 insertions(+), 294 deletions(-) diff --git a/include/fused_kernel/algorithms/basic_ops/algebraic.h b/include/fused_kernel/algorithms/basic_ops/algebraic.h index 3b192751..2fce9834 100644 --- a/include/fused_kernel/algorithms/basic_ops/algebraic.h +++ b/include/fused_kernel/algorithms/basic_ops/algebraic.h @@ -38,7 +38,7 @@ namespace fk { using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { const float3 xOut = input * params.x; const float3 yOut = input * params.y; const float3 zOut = input * params.z; @@ -54,7 +54,7 @@ namespace fk { FK_STATIC_STRUCT(MxVFloat3, SelfType) using Parent = UnaryOperation, float3, MxVFloat3>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { const float3 xOut = get<0>(input) * get<1>(input).x; const float3 yOut = get<0>(input) * get<1>(input).y; const float3 zOut = get<0>(input) * get<1>(input).z; diff --git a/include/fused_kernel/algorithms/basic_ops/arithmetic.h b/include/fused_kernel/algorithms/basic_ops/arithmetic.h index 02de435b..a07a0d04 100644 --- a/include/fused_kernel/algorithms/basic_ops/arithmetic.h +++ b/include/fused_kernel/algorithms/basic_ops/arithmetic.h @@ -30,7 +30,7 @@ namespace fk { FK_STATIC_STRUCT(Add, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return input + params; } }; @@ -43,7 +43,7 @@ namespace fk { FK_STATIC_STRUCT(Add, SelfType) using Parent = UnaryOperation, O, Add>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return get<0>(input) + get<1>(input); } }; @@ -56,7 +56,7 @@ namespace fk { FK_STATIC_STRUCT(Sub, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return input - params; } }; @@ -69,7 +69,7 @@ namespace fk { FK_STATIC_STRUCT(Mul, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return input * params; } }; @@ -82,7 +82,7 @@ namespace fk { FK_STATIC_STRUCT(Div, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return input / params; } }; diff --git a/include/fused_kernel/algorithms/basic_ops/cast.h b/include/fused_kernel/algorithms/basic_ops/cast.h index aaf76099..1627104d 100644 --- a/include/fused_kernel/algorithms/basic_ops/cast.h +++ b/include/fused_kernel/algorithms/basic_ops/cast.h @@ -27,7 +27,7 @@ namespace fk { FK_STATIC_STRUCT(Cast, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::cast::f(input); } }; diff --git a/include/fused_kernel/algorithms/basic_ops/logical.h b/include/fused_kernel/algorithms/basic_ops/logical.h index 8376704e..a6e07d83 100644 --- a/include/fused_kernel/algorithms/basic_ops/logical.h +++ b/include/fused_kernel/algorithms/basic_ops/logical.h @@ -29,7 +29,7 @@ namespace fk { FK_STATIC_STRUCT(IsEven, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE auto exec(const InputType& input) { + FK_HOST_DEVICE_FUSE auto exec(const InputType input) { return cxp::is_even::f(input); } }; @@ -42,7 +42,7 @@ namespace fk { FK_STATIC_STRUCT(Max, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return cxp::max::f(input, params); } }; @@ -55,7 +55,7 @@ namespace fk { FK_STATIC_STRUCT(Max, SelfType) using Parent = UnaryOperation, O, Max>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::max::f(get<0>(input), get<1>(input)); } }; @@ -68,7 +68,7 @@ namespace fk { FK_STATIC_STRUCT(Min, SelfType) using Parent = BinaryOperation>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return cxp::min::f(input, params); } }; @@ -81,7 +81,7 @@ namespace fk { FK_STATIC_STRUCT(Min, SelfType) using Parent = UnaryOperation, O, Min>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::min::f(get<0>(input), get<1>(input)); } }; @@ -95,21 +95,21 @@ namespace fk { using Parent = UnaryOperation, bool, Equal>; DECLARE_UNARY_PARENT template > - FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType& input) { + FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType input) { return get<0>(input) == get<1>(input); } template > - FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType& input) { + FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType input) { const auto result = get<0>(input) == get<1>(input); return result.x && result.y; } template > - FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType& input) { + FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType input) { const auto result = get<0>(input) == get<1>(input); return result.x && result.y && result.z; } template > - FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType& input) { + FK_HOST_DEVICE_FUSE std::enable_if_t exec(const InputType input) { const auto result = get<0>(input) == get<1>(input); return result.x && result.y && result.z && result.w; } diff --git a/include/fused_kernel/algorithms/basic_ops/memory_operations.h b/include/fused_kernel/algorithms/basic_ops/memory_operations.h index 844e52ca..91184730 100644 --- a/include/fused_kernel/algorithms/basic_ops/memory_operations.h +++ b/include/fused_kernel/algorithms/basic_ops/memory_operations.h @@ -32,16 +32,16 @@ namespace fk { FK_STATIC_STRUCT(PerThreadRead, SelfType) DECLARE_READ_PARENT template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const ParamsType& params) + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const ParamsType& params) -> ThreadFusionType { return *PtrAccessor::template cr_point>(thread, params); } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { if constexpr (D == ND::_1D) { return 1; } else { @@ -49,7 +49,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { if constexpr (D == ND::_1D || D == ND::_2D) { return 1; } else { @@ -57,7 +57,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } @@ -90,15 +90,15 @@ namespace fk { FK_STATIC_STRUCT(PerThreadWrite, SelfType) DECLARE_WRITE_PARENT template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, - const ThreadFusionType& input, + FK_HOST_DEVICE_FUSE void exec(const Point thread, + const ThreadFusionType input, const ParamsType& params) { *PtrAccessor::template point>(thread, params) = input; } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } FK_HOST_FUSE InstantiableType build(const Ptr& ptr) { @@ -131,23 +131,23 @@ namespace fk { DECLARE_READ_PARENT template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const ParamsType& params) + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const ParamsType& params) -> ThreadFusionType { return *PtrAccessor::template cr_point>(thread, params); } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dims.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return opData.params.dims.planes; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } @@ -165,15 +165,15 @@ namespace fk { FK_STATIC_STRUCT(TensorWrite, SelfType) DECLARE_WRITE_PARENT template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const ThreadFusionType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const ThreadFusionType input, const ParamsType& params) { *PtrAccessor::template point>(thread, params) = input; } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } }; @@ -186,7 +186,7 @@ namespace fk { public: FK_STATIC_STRUCT(TensorSplit, SelfType) DECLARE_WRITE_PARENT - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const T& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split tensor write. It must be one of 2, 3 or 4."); @@ -203,10 +203,10 @@ namespace fk { *(work_plane + (planePixels * 3)) = input.w; } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } }; @@ -219,7 +219,7 @@ namespace fk { public: FK_STATIC_STRUCT(TensorTSplit, SelfType) DECLARE_WRITE_PARENT - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split tensor write. It must be one of 2, 3 or 4."); @@ -232,10 +232,10 @@ namespace fk { *PtrAccessor::point(thread, params, 3) = input.w; } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } }; @@ -248,7 +248,7 @@ namespace fk { public: FK_STATIC_STRUCT(TensorPack, SelfType) DECLARE_READ_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split tensor read. It must be one of 2, 3 or 4."); @@ -268,19 +268,19 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dims.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return opData.params.dims.planes; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } @@ -297,7 +297,7 @@ namespace fk { public: FK_STATIC_STRUCT(TensorTPack, SelfType) DECLARE_READ_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split tensor read. It must be one of 2, 3 or 4."); @@ -316,16 +316,16 @@ namespace fk { return make_(x, y, z, w); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dims.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dims.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return opData.params.dims.planes; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.dims.pitch; } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -365,7 +365,7 @@ namespace fk { public: FK_STATIC_STRUCT(SplitWrite, SelfType) DECLARE_WRITE_PARENT - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType& params) { static_assert(cn >= 2, "Wrong type for split write. It must be one of 2, 3 or 4."); *PtrAccessor::point(thread, params.x) = input.x; @@ -373,10 +373,10 @@ namespace fk { if constexpr (cn >= 3) *PtrAccessor::point(thread, params.z) = input.z; if constexpr (cn == 4) *PtrAccessor::point(thread, params.w) = input.w; } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.x.dims.width; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return opData.params.x.dims.pitch; } @@ -420,7 +420,7 @@ namespace fk { namespace circular_batch_internal { template - FK_HOST_DEVICE_CNST Point computeCircularThreadIdx(const Point& currentIdx, const int& fst) { + FK_HOST_DEVICE_CNST Point computeCircularThreadIdx(const Point currentIdx, const int fst) { if constexpr (direction == CircularDirection::Ascendent) { const int z = currentIdx.z + fst; return { currentIdx.x, currentIdx.y, z >= BATCH ? z - BATCH : z }; @@ -444,7 +444,7 @@ namespace fk { FK_STATIC_STRUCT(CircularBatchRead, SelfType) DECLARE_READ_PARENT template - FK_HOST_DEVICE_FUSE ThreadFusionType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE ThreadFusionType exec(const Point thread, const ParamsType& params) { const Point newThreadIdx = circular_batch_internal::computeCircularThreadIdx(thread, params.first); if constexpr (THREAD_FUSION) { return Operation::template exec(newThreadIdx, params.opData[newThreadIdx.z]); @@ -452,19 +452,19 @@ namespace fk { return Operation::exec(newThreadIdx, params.opData[newThreadIdx.z]); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } @@ -486,7 +486,7 @@ namespace fk { FK_STATIC_STRUCT(CircularBatchWrite, SelfType) DECLARE_WRITE_PARENT template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const ThreadFusionType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const ThreadFusionType input, const ParamsType& params) { const Point newThreadIdx = circular_batch_internal::computeCircularThreadIdx(thread, params.first); if constexpr (THREAD_FUSION) { Operation::template exec(newThreadIdx, input, params.opData[newThreadIdx.z]); @@ -494,10 +494,10 @@ namespace fk { Operation::exec(newThreadIdx, input, params.opData[newThreadIdx.z]); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opBatch) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opBatch) { return Operation::num_elems_x(thread, opBatch.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opBatch) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opBatch) { return Operation::pitch(thread, opBatch.params.opData[thread.z]); } }; @@ -515,7 +515,7 @@ namespace fk { FK_STATIC_STRUCT(CircularTensorRead, SelfType) DECLARE_READ_PARENT template - FK_HOST_DEVICE_FUSE ThreadFusionType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE ThreadFusionType exec(const Point thread, const ParamsType& params) { const Point newThreadIdx = circular_batch_internal::computeCircularThreadIdx(thread, params.first); if constexpr (THREAD_FUSION) { return Operation::template exec(newThreadIdx, params.opData); @@ -523,19 +523,19 @@ namespace fk { return Operation::exec(newThreadIdx, params.opData); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData); } @@ -557,8 +557,8 @@ namespace fk { FK_STATIC_STRUCT(CircularTensorWrite, SelfType) DECLARE_WRITE_PARENT template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, - const ThreadFusionType& input, + FK_HOST_DEVICE_FUSE void exec(const Point thread, + const ThreadFusionType input, const ParamsType& params) { const Point newThreadIdx = circular_batch_internal::computeCircularThreadIdx(thread, params.first); if constexpr (THREAD_FUSION) { @@ -567,10 +567,10 @@ namespace fk { Operation::exec(newThreadIdx, input, params.opData); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData); } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData); } }; diff --git a/include/fused_kernel/algorithms/basic_ops/set.h b/include/fused_kernel/algorithms/basic_ops/set.h index 40c86131..5a0c9a5b 100644 --- a/include/fused_kernel/algorithms/basic_ops/set.h +++ b/include/fused_kernel/algorithms/basic_ops/set.h @@ -34,19 +34,19 @@ namespace fk { FK_STATIC_STRUCT(ReadSet, SelfType) using Parent = ReadOperation, T, TF::DISABLED, ReadSet>; DECLARE_READ_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { return params.value; } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.size.x; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.size.y; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return opData.params.size.z; } diff --git a/include/fused_kernel/algorithms/basic_ops/static_loop.h b/include/fused_kernel/algorithms/basic_ops/static_loop.h index 321ead53..88c7f783 100644 --- a/include/fused_kernel/algorithms/basic_ops/static_loop.h +++ b/include/fused_kernel/algorithms/basic_ops/static_loop.h @@ -29,7 +29,7 @@ namespace fk { private: template - FK_DEVICE_FUSE OutputType helper_exec(const InputType& input, const ParamsType& params) { + FK_DEVICE_FUSE OutputType helper_exec(const InputType input, const ParamsType& params) { if constexpr (ITERATION + 1 < ITERATIONS) { return helper_exec(Operation::exec(input, params), params); } else { @@ -38,7 +38,7 @@ namespace fk { } public: - FK_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { return helper_exec<0>(Operation::exec(input, params), params); } }; diff --git a/include/fused_kernel/algorithms/basic_ops/vector_ops.h b/include/fused_kernel/algorithms/basic_ops/vector_ops.h index b3ccd25e..f4c97584 100644 --- a/include/fused_kernel/algorithms/basic_ops/vector_ops.h +++ b/include/fused_kernel/algorithms/basic_ops/vector_ops.h @@ -27,7 +27,7 @@ namespace fk { FK_STATIC_STRUCT(Discard, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(cn > cn, "Output type should at least have one channel less"); static_assert(std::is_same_v, VBase>, "Base types should be the same"); @@ -48,7 +48,7 @@ namespace fk { FK_STATIC_STRUCT(VectorReorder, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(validCUDAVec, "Non valid CUDA vetor type: UnaryVectorReorder"); static_assert(cn >= 2, "Minimum number of channels is 2: UnaryVectorReorder"); return {static_get(input)...}; @@ -63,7 +63,7 @@ namespace fk { FK_STATIC_STRUCT(VectorReorderRT, SelfType) using Parent = BinaryOperation>, T, VectorReorderRT>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { static_assert(validCUDAVec, "Non valid CUDA vetor type"); static_assert(cn >= 2, "Minimum number of channels is 2"); if constexpr (cn == 2) { @@ -86,7 +86,7 @@ namespace fk { FK_STATIC_STRUCT(VectorReduce, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::vector_reduce::f(input); } }; @@ -99,7 +99,7 @@ namespace fk { FK_STATIC_STRUCT(AddLast, SelfType) using Parent = BinaryOperation::base, O, AddLast>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { static_assert(cn == cn -1, "Output type should have one channel more"); static_assert(std::is_same_v::base, typename VectorTraits::base>, "Base types should be the same"); @@ -125,7 +125,7 @@ namespace fk { FK_STATIC_STRUCT(VectorAnd, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::vector_and::f(input); } }; diff --git a/include/fused_kernel/algorithms/image_processing/border_reader.h b/include/fused_kernel/algorithms/image_processing/border_reader.h index e8797f6f..a681136f 100644 --- a/include/fused_kernel/algorithms/image_processing/border_reader.h +++ b/include/fused_kernel/algorithms/image_processing/border_reader.h @@ -56,15 +56,15 @@ namespace fk { using Parent = IncompleteReadBackOperation; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -93,15 +93,15 @@ namespace fk { using Parent = IncompleteReadBackOperation, NullType, NullType, SelfType>; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -127,15 +127,15 @@ namespace fk { using Parent = IncompleteReadBackOperation, NullType, NullType, SelfType>; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -165,18 +165,18 @@ public: \ using Parent = ReadBackOperation, \ BackIOp_, typename BackIOp_::Operation::OutputType, SelfType>; \ DECLARE_READBACK_PARENT \ -FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { \ +FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { \ return BackIOp::Operation::num_elems_x(thread, opData.backIOp); \ } \ -FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { \ +FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { \ return BackIOp::Operation::num_elems_y(thread, opData.backIOp); \ } \ -FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { \ +FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { \ return BackIOp::Operation::num_elems_z(thread, opData.backIOp); \ } #define BORDER_READER_EXEC \ -FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { \ +FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { \ const int last_col = BackIOp::Operation::num_elems_x(thread, backIOp) - 1; \ const int last_row = BackIOp::Operation::num_elems_y(thread, backIOp) - 1; \ const Point new_thread{idx_col(thread.x, last_col), idx_row(thread.y, last_row), thread.z}; \ @@ -197,17 +197,17 @@ FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& param BackIOp_, ReadAndOutputType, SelfType>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_x(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_y(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_z(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { const int width = BackIOp::Operation::num_elems_x(thread, backIOp); const int height = BackIOp::Operation::num_elems_y(thread, backIOp); if (thread.x >= 0 && thread.x < width && thread.y >= 0 && thread.y < height) { @@ -274,7 +274,7 @@ FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& param struct BorderReader, BackIOp_, std::enable_if_t, void>> { BORDER_READER_DETAILS(BorderType::WRAP) - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { const int width = BackIOp::Operation::num_elems_x(thread, backIOp); const int height = BackIOp::Operation::num_elems_y(thread, backIOp); const Point new_thread{idx_col(thread.x, width), idx_row(thread.y, height), thread.z}; diff --git a/include/fused_kernel/algorithms/image_processing/color_conversion.h b/include/fused_kernel/algorithms/image_processing/color_conversion.h index 7eaaf6cb..3f667278 100644 --- a/include/fused_kernel/algorithms/image_processing/color_conversion.h +++ b/include/fused_kernel/algorithms/image_processing/color_conversion.h @@ -34,7 +34,7 @@ namespace fk { FK_STATIC_STRUCT(StaticAddAlpha, SelfType) using Parent = UnaryOperation, StaticAddAlpha>; DECLARE_UNARY_PARENT - FK_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_DEVICE_FUSE OutputType exec(const InputType input) { return AddLast::exec(input, { alpha }); } }; @@ -52,7 +52,7 @@ namespace fk { FK_STATIC_STRUCT(RGB2Gray, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { // 0.299*R + 0.587*G + 0.114*B if constexpr (std::is_unsigned_v) { #ifdef __CUDA_ARCH__ @@ -71,7 +71,7 @@ namespace fk { } } private: - FK_HOST_DEVICE_FUSE float compute_luminance(const InputType& input) { + FK_HOST_DEVICE_FUSE float compute_luminance(const InputType input) { return (input.x * 0.299f) + (input.y * 0.587f) + (input.z * 0.114f); } }; @@ -110,7 +110,7 @@ namespace fk { FK_STATIC_STRUCT(AddOpaqueAlpha, SelfType) using Parent = UnaryOperation, AddOpaqueAlpha>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { constexpr auto alpha = maxDepthValue; return AddLast::exec(input, { alpha }); } @@ -124,7 +124,7 @@ namespace fk { FK_STATIC_STRUCT(SaturateDepth, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return Saturate::exec(input, { { 0.f, static_cast(maxDepthValue) } }); } }; @@ -192,7 +192,7 @@ namespace fk { FK_STATIC_STRUCT(DenormalizePixel, SelfType) using Parent = UnaryOperation>, O, DenormalizePixel>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { constexpr auto maxDepth = maxDepthValue; return cxp::cast::f(input * maxDepth); } @@ -206,7 +206,7 @@ namespace fk { FK_STATIC_STRUCT(NormalizePixel, SelfType) using Parent = UnaryOperation>, NormalizePixel>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return input / static_cast(maxDepthValue); } }; @@ -219,7 +219,7 @@ namespace fk { FK_STATIC_STRUCT(SaturateDenormalizePixel, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(std::is_same_v, float>, "SaturateDenormalizePixel only works with float base types."); const InputType saturatedFloat = SaturateFloat::exec(input); return DenormalizePixel::exec(saturatedFloat); @@ -235,7 +235,7 @@ namespace fk { using Parent = UnaryOperation>; DECLARE_UNARY_PARENT using Base = typename VectorTraits::base; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(std::is_floating_point_v>, "NormalizeColorRangeDepth only works for floating point values"); // The nvcc compiler will only be able to use the global constexpr floatShiftFactor variable if it is stored in // a local variable. @@ -262,7 +262,7 @@ namespace fk { // Y -> input.x // Cb(U) -> input.y // Cr(V) -> input.z - FK_HOST_DEVICE_FUSE float3 computeRGB(const InputType& pixel) { + FK_HOST_DEVICE_FUSE float3 computeRGB(const InputType pixel) { constexpr M3x3Float coefficients = ccMatrix; constexpr float CSub = subCoefficients.chroma; if constexpr (CP == ColorPrimitives::bt601) { @@ -273,7 +273,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE OutputType computePixel(const InputType& pixel) { + FK_HOST_DEVICE_FUSE OutputType computePixel(const InputType pixel) { const float3 pixelRGBFloat = computeRGB(pixel); if constexpr (std::is_same_v, float>) { if constexpr (ALPHA) { @@ -293,7 +293,7 @@ namespace fk { } public: - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { // Pixel data shifted to the right to it's color depth numerical range constexpr auto shiftFactorLocal = shiftFactor; const InputType shiftedPixel = input >> shiftFactorLocal; @@ -326,7 +326,7 @@ namespace fk { TF::DISABLED, ReadYUV>; DECLARE_READ_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { const auto rawPtr = params.data; if constexpr (PF == PixelFormat::NV12 || PF == PixelFormat::P010 || PF == PixelFormat::P016 || PF == PixelFormat::P210 || @@ -382,15 +382,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -416,7 +416,7 @@ namespace fk { TF::DISABLED, WriteYUV>; DECLARE_WRITE_PARENT - FK_HOST_DEVICE_FUSE void exec(const Point& thread, const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType& params) { const auto rawPtr = params.data; if constexpr (PF == PixelFormat::NV12 || PF == PixelFormat::P010 || PF == PixelFormat::P016 || PF == PixelFormat::P210 || @@ -494,15 +494,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/algorithms/image_processing/crop.h b/include/fused_kernel/algorithms/image_processing/crop.h index 5f1e66ef..76cf788f 100644 --- a/include/fused_kernel/algorithms/image_processing/crop.h +++ b/include/fused_kernel/algorithms/image_processing/crop.h @@ -33,20 +33,20 @@ namespace fk { typename BackIOp_::Operation::OutputType, Crop>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { const Point newThread{thread.x + static_cast(params.x), thread.y + static_cast(params.y)}; return BackIOp::Operation::exec(newThread, backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -68,15 +68,15 @@ namespace fk { using Parent = IncompleteReadBackOperation>; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/algorithms/image_processing/deinterlace.h b/include/fused_kernel/algorithms/image_processing/deinterlace.h index 3694c17b..fa1bb7a0 100644 --- a/include/fused_kernel/algorithms/image_processing/deinterlace.h +++ b/include/fused_kernel/algorithms/image_processing/deinterlace.h @@ -51,7 +51,7 @@ namespace fk { Deinterlace>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { if constexpr (DType == DeinterlaceType::BLEND) { return execBlend(thread, params, backIOp); } else { // INTER_LINEAR @@ -59,15 +59,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_x(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_y(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -75,7 +75,7 @@ namespace fk { return { num_elems_x(Point{0,0,0}, opData), num_elems_y(Point{0,0,0}, opData), num_elems_z(Point{0,0,0}, opData) }; } private: - FK_HOST_DEVICE_FUSE OutputType execBlend(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType execBlend(const Point thread, const ParamsType& params, const BackIOp& backIOp) { // For blend deinterlacing, we average the current line with adjacent lines using ReadOperation = typename BackIOp::Operation; @@ -90,7 +90,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE OutputType execInterLinearGetPixel(const Point& thread, const BackIOp& backIOp, const bool& interpolate) { + FK_HOST_DEVICE_FUSE OutputType execInterLinearGetPixel(const Point thread, const BackIOp& backIOp, const bool& interpolate) { using ReadOperation = typename BackIOp::Operation; if (interpolate) { // We average the above pixel with the below pixel @@ -102,7 +102,7 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE OutputType execInterLinear(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType execInterLinear(const Point thread, const ParamsType& params, const BackIOp& backIOp) { using ReadOperation = typename BackIOp::Operation; // Assuming BackFunction::Operation::num_elems_y(Point{0,0,0}, backIOp) is an even number @@ -129,15 +129,15 @@ namespace fk { template using NewInstantiableType = ReadBack>; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/algorithms/image_processing/image.h b/include/fused_kernel/algorithms/image_processing/image.h index f5e11372..90cb1c7b 100644 --- a/include/fused_kernel/algorithms/image_processing/image.h +++ b/include/fused_kernel/algorithms/image_processing/image.h @@ -58,7 +58,7 @@ namespace fk { return ptr(); } - FK_HOST_CNST Image crop(const Point& p, const uint& newWidth, const uint& newHeight) { + FK_HOST_CNST Image crop(const Point p, const uint& newWidth, const uint& newHeight) { const uint newDataWidth = newWidth * PixelFormatTraits::rf.width_f; const uint newDataHeight = newHeight * PixelFormatTraits::rf.height_f; const Point dataPoint{p.x * PixelFormatTraits::rf.width_f, p.y * PixelFormatTraits::rf.height_f, 0}; @@ -87,7 +87,7 @@ namespace fk { #endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED) #endif // defined(NVRTC_COMPILER) - FK_HOST_CNST VectorType_t::cn> readAt(const Point& p) const { + FK_HOST_CNST VectorType_t::cn> readAt(const Point p) const { return ReadYUV::exec(p, ptr()); } }; diff --git a/include/fused_kernel/algorithms/image_processing/interpolation.h b/include/fused_kernel/algorithms/image_processing/interpolation.h index b195fe09..edb76ff3 100644 --- a/include/fused_kernel/algorithms/image_processing/interpolation.h +++ b/include/fused_kernel/algorithms/image_processing/interpolation.h @@ -55,19 +55,19 @@ namespace fk { SelfType>; DECLARE_TERNARY_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_x(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return BackIOp::Operation::num_elems_y(thread, opData.backIOp); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params, const BackIOp& backIOp) { const float src_x = input.x; const float src_y = input.y; diff --git a/include/fused_kernel/algorithms/image_processing/resize.h b/include/fused_kernel/algorithms/image_processing/resize.h index f76cd047..b760e980 100644 --- a/include/fused_kernel/algorithms/image_processing/resize.h +++ b/include/fused_kernel/algorithms/image_processing/resize.h @@ -27,7 +27,7 @@ namespace fk { FK_STATIC_STRUCT(ComputeResizePoint, ComputeResizePoint) using Parent = BinaryOperation; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType thread, const ParamsType& params) { // This is what makes the interpolation a resize operation const float fx = params.x; const float fy = params.y; @@ -83,7 +83,7 @@ namespace fk { SelfType>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { if constexpr (AR == AspectRatio::IGNORE_AR) { return exec_resize(thread, params, backIOp); } else { @@ -97,15 +97,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -114,7 +114,7 @@ namespace fk { } private: - FK_HOST_DEVICE_FUSE auto exec_resize(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE auto exec_resize(const Point thread, const ParamsType& params, const BackIOp& backIOp) { static_assert(opIs, "BackIOp must be a ternary type for this specialization"); const float src_x = thread.x * params.src_conv_factors.x; @@ -138,13 +138,13 @@ namespace fk { template using NewInstantiableType = ResizeComplete>>; - FK_HOST_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -249,13 +249,13 @@ namespace fk { template using NewInstantiableType = ResizeComplete>>; - FK_HOST_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/algorithms/image_processing/saturate.h b/include/fused_kernel/algorithms/image_processing/saturate.h index 5fa1b0e8..c8097e62 100644 --- a/include/fused_kernel/algorithms/image_processing/saturate.h +++ b/include/fused_kernel/algorithms/image_processing/saturate.h @@ -30,7 +30,7 @@ namespace fk { FK_STATIC_STRUCT(SaturateCast, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return cxp::saturate_cast::f(input); } }; @@ -43,7 +43,7 @@ namespace fk { FK_STATIC_STRUCT(Saturate, SelfType) using Parent = BinaryOperation, 2>, T, Saturate>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { static_assert(!validCUDAVec, "Saturate only works with non cuda vector types"); return cxp::max::f(cxp::min::f(input, params.y), params.x); } @@ -57,7 +57,7 @@ namespace fk { FK_STATIC_STRUCT(SaturateFloat, SelfType) using Parent = UnaryOperation>; DECLARE_UNARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(std::is_same_v, float>, "Saturate float only works with float base types."); return cxp::max::f(make_set(0.f), cxp::min::f(input, make_set(1.f)));; diff --git a/include/fused_kernel/algorithms/image_processing/warping.h b/include/fused_kernel/algorithms/image_processing/warping.h index 5b206e55..2da8cdf2 100644 --- a/include/fused_kernel/algorithms/image_processing/warping.h +++ b/include/fused_kernel/algorithms/image_processing/warping.h @@ -47,7 +47,7 @@ namespace fk { FK_STATIC_STRUCT(WarpingCoords, SelfType) using Parent = BinaryOperation, float2, WarpingCoords>; DECLARE_BINARY_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType thread, const ParamsType& params) { const int x = thread.x; const int y = thread.y; const auto& transMatRaw = params.transformMatrix.data; @@ -80,7 +80,7 @@ namespace fk { float_>, Warping>; DECLARE_READBACK_PARENT - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) { const float2 coord = WarpingCoords::exec(thread, params); const Size sourceSize(BackIOp::Operation::num_elems_x(thread, backIOp), BackIOp::Operation::num_elems_y(thread, backIOp)); @@ -91,15 +91,15 @@ namespace fk { } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } @@ -121,15 +121,15 @@ namespace fk { Warping>; DECLARE_INCOMPLETEREADBACK_PARENT - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.width; } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return opData.params.dstSize.height; } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return 1; } diff --git a/include/fused_kernel/core/data/ptr_nd.h b/include/fused_kernel/core/data/ptr_nd.h index bedbe42f..b3a70138 100644 --- a/include/fused_kernel/core/data/ptr_nd.h +++ b/include/fused_kernel/core/data/ptr_nd.h @@ -439,7 +439,7 @@ namespace fk { inline constexpr operator RawPtr() const { return ptr_a; } - inline constexpr Ptr crop(const Point& p, const PtrDims& newDims) { + inline constexpr Ptr crop(const Point p, const PtrDims& newDims) { T* ptr = At::point(p, ptr_a); if (ref) { ref->cnt.fetch_add(1); @@ -564,7 +564,7 @@ namespace fk { inline void download(Stream& stream) {} #endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED) - inline T at(const Point& p) const { + inline T at(const Point p) const { if (type != MemType::Device) { return *At::cr_point(p, ptr_pinned); } else { @@ -649,7 +649,7 @@ namespace fk { inline constexpr Ptr1D(T* data_, const PtrDims& dims_, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) : Ptr(data_, dims_, type_, deviceID_) {} - inline constexpr Ptr1D crop1D(const Point& p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } + inline constexpr Ptr1D crop1D(const Point p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } }; template @@ -666,7 +666,7 @@ namespace fk { inline Ptr2D(T* data_, const uint& width_, const uint& height_, const uint& pitch_, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) : Ptr(data_, PtrDims(width_, height_, pitch_), type_, deviceID_) {} - inline Ptr2D crop2D(const Point& p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } + inline Ptr2D crop2D(const Point p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } inline void Alloc(const fk::Size& size, const uint& pitch_ = 0, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) { this->freePtr(); this->allocPtr(PtrDims(size.width, size.height, pitch_), type_, deviceID_); @@ -685,7 +685,7 @@ namespace fk { inline constexpr Ptr3D(T* data_, const PtrDims& dims_, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) : Ptr(data_, dims_, type_, deviceID_) {} - inline constexpr Ptr3D crop3D(const Point& p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } + inline constexpr Ptr3D crop3D(const Point p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } }; // A color-plane-transposed 3D pointer PtrT3D @@ -700,7 +700,7 @@ namespace fk { inline constexpr PtrT3D(T* data_, const PtrDims& dims_, const MemType& type_ = defaultMemType, const int& deviceID_ = 0) : Ptr(data_, dims_, type_, deviceID_) {} - inline constexpr PtrT3D crop3D(const Point& p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } + inline constexpr PtrT3D crop3D(const Point p, const PtrDims& newDims) { return Ptr::crop(p, newDims); } }; // A Tensor pointer diff --git a/include/fused_kernel/core/data/rawptr.h b/include/fused_kernel/core/data/rawptr.h index cc05159b..254d52c0 100644 --- a/include/fused_kernel/core/data/rawptr.h +++ b/include/fused_kernel/core/data/rawptr.h @@ -189,12 +189,12 @@ namespace fk { template <> struct PtrAccessor { template - FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point p, const RawPtr& ptr) { return ((const BiggerType*)ptr.data) + p.x; } template - FK_HOST_DEVICE_STATIC BiggerType* point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_STATIC BiggerType* point(const Point p, const RawPtr& ptr) { return (BiggerType*)ptr.data + p.x; } }; @@ -202,12 +202,12 @@ namespace fk { template <> struct PtrAccessor { template - FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point p, const RawPtr& ptr) { return (const BiggerType*)((const char*)ptr.data + (p.y * ptr.dims.pitch)) + p.x; } template - FK_HOST_DEVICE_STATIC BiggerType* point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_STATIC BiggerType* point(const Point p, const RawPtr& ptr) { return (BiggerType*)((char*)ptr.data + (p.y * ptr.dims.pitch)) + p.x; } }; @@ -215,12 +215,12 @@ namespace fk { template <> struct PtrAccessor { template - FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point p, const RawPtr& ptr) { return (const BiggerType*)((const char*)ptr.data + (ptr.dims.plane_pitch * ptr.dims.color_planes * p.z) + (p.y * ptr.dims.pitch)) + p.x; } template - FK_HOST_DEVICE_STATIC BiggerType* point(const Point& p, const RawPtr& ptr) { + FK_HOST_DEVICE_STATIC BiggerType* point(const Point p, const RawPtr& ptr) { return (BiggerType*)((char*)ptr.data + (ptr.dims.plane_pitch * ptr.dims.color_planes * p.z) + (p.y * ptr.dims.pitch)) + p.x; } }; @@ -228,12 +228,12 @@ namespace fk { template <> struct PtrAccessor { template - FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point& p, const RawPtr& ptr, const uint& color_plane = 0) { + FK_HOST_DEVICE_FUSE const BiggerType* cr_point(const Point p, const RawPtr& ptr, const uint color_plane = 0) { return (const BiggerType*)((const char*)ptr.data + (color_plane * ptr.dims.color_planes_pitch) + (ptr.dims.plane_pitch * p.z) + (ptr.dims.pitch * p.y)) + p.x; } template - FK_HOST_DEVICE_STATIC BiggerType* point(const Point& p, const RawPtr& ptr, const uint& color_plane = 0) { + FK_HOST_DEVICE_STATIC BiggerType* point(const Point p, const RawPtr& ptr, const uint color_plane = 0) { return (BiggerType*)((char*)ptr.data + (color_plane * ptr.dims.color_planes_pitch) + (ptr.dims.plane_pitch * p.z) + (ptr.dims.pitch * p.y)) + p.x; } }; @@ -244,12 +244,12 @@ namespace fk { template<> struct StaticPtrAccessor { template - FK_HOST_DEVICE_FUSE T read(const Point& p, const StaticRawPtr, T>& ptr) { + FK_HOST_DEVICE_FUSE T read(const Point p, const StaticRawPtr, T>& ptr) { return ptr.data[p.x]; } template - FK_HOST_DEVICE_FUSE void write(const Point& p, StaticRawPtr, T>& ptr, const T& value) { + FK_HOST_DEVICE_FUSE void write(const Point p, StaticRawPtr, T>& ptr, const T value) { ptr.data[p.x] = value; } }; @@ -257,12 +257,12 @@ namespace fk { template<> struct StaticPtrAccessor { template - FK_HOST_DEVICE_FUSE T read(const Point& p, const StaticRawPtr, T>& ptr) { + FK_HOST_DEVICE_FUSE T read(const Point p, const StaticRawPtr, T>& ptr) { return ptr.data[p.y][p.x]; } template - FK_HOST_DEVICE_FUSE void write(const Point& p, StaticRawPtr, T>& ptr, const T& value) { + FK_HOST_DEVICE_FUSE void write(const Point p, StaticRawPtr, T>& ptr, const T value) { ptr.data[p.y][p.x] = value; } }; @@ -270,12 +270,12 @@ namespace fk { template<> struct StaticPtrAccessor { template - FK_HOST_DEVICE_FUSE T read(const Point& p, const StaticRawPtr, T>& ptr) { + FK_HOST_DEVICE_FUSE T read(const Point p, const StaticRawPtr, T>& ptr) { return ptr.data[p.z][p.y][p.x]; } template - FK_HOST_DEVICE_FUSE void write(const Point& p, StaticRawPtr, T>& ptr, const T& value) { + FK_HOST_DEVICE_FUSE void write(const Point p, StaticRawPtr, T>& ptr, const T value) { ptr.data[p.z][p.y][p.x] = value; } }; diff --git a/include/fused_kernel/core/data/rect.h b/include/fused_kernel/core/data/rect.h index 09083c0b..5e361a1c 100644 --- a/include/fused_kernel/core/data/rect.h +++ b/include/fused_kernel/core/data/rect.h @@ -24,8 +24,8 @@ namespace fk { struct Rect_ { P x{ 0 }, y{0}; S width{ 0 }, height{0}; - FK_HOST_DEVICE_CNST Rect_(const Point& point, const Size& size) : x(point.x), y(point.y), width(size.width), height(size.height) {} - FK_HOST_DEVICE_CNST Rect_(const P& x_, const P& y_, const S& width_, const S& height_) : x(x_), y(y_), width(width_), height(height_) {} + FK_HOST_DEVICE_CNST Rect_(const Point point, const Size size) : x(point.x), y(point.y), width(size.width), height(size.height) {} + FK_HOST_DEVICE_CNST Rect_(const P x_, const P y_, const S width_, const S height_) : x(x_), y(y_), width(width_), height(height_) {} FK_HOST_DEVICE_CNST Rect_(){} }; diff --git a/include/fused_kernel/core/execution_model/data_parallel_patterns.h b/include/fused_kernel/core/execution_model/data_parallel_patterns.h index d80122ec..86bba805 100644 --- a/include/fused_kernel/core/execution_model/data_parallel_patterns.h +++ b/include/fused_kernel/core/execution_model/data_parallel_patterns.h @@ -72,24 +72,24 @@ namespace fk { // namespace FusedKernel private: using Details = DPPDetails; - template - FK_HOST_DEVICE_FUSE auto operate(const Point& thread, const T& i_data, const IOpTypes&... iOpInstances) { + template + FK_HOST_DEVICE_FUSE auto operate(const Point thread, const InputType i_data, const IOpTypes&... iOpInstances) { return (InputFoldType(thread, i_data) | ... | iOpInstances).input; } template - FK_HOST_DEVICE_FUSE auto operate_idx(const Point& thread, const InputType& input, const IOpTypes&... instantiableOperationInstances) { + FK_HOST_DEVICE_FUSE auto operate_idx(const Point thread, const InputType input, const IOpTypes&... instantiableOperationInstances) { return operate(thread, TFI::template get(input), instantiableOperationInstances...); } template - FK_HOST_DEVICE_FUSE auto operate_thread_fusion_impl(std::integer_sequence idx, const Point& thread, - const InputType& input, const IOpTypes&... instantiableOperationInstances) { + FK_HOST_DEVICE_FUSE auto operate_thread_fusion_impl(std::integer_sequence idx, const Point thread, + const InputType input, const IOpTypes&... instantiableOperationInstances) { return TFI::make(operate_idx(thread, input, instantiableOperationInstances...)...); } template - FK_HOST_DEVICE_FUSE auto operate_thread_fusion(const Point& thread, const InputType& input, const IOpTypes&... instantiableOperationInstances) { + FK_HOST_DEVICE_FUSE auto operate_thread_fusion(const Point thread, const InputType input, const IOpTypes&... instantiableOperationInstances) { if constexpr (TFI::elems_per_thread == 1) { return operate(thread, input, instantiableOperationInstances...); } else { @@ -98,7 +98,7 @@ namespace fk { // namespace FusedKernel } // We pass TFI as a template parameter because sometimes we need to disable the TF template - FK_HOST_DEVICE_FUSE auto read(const Point& thread, const ReadIOp& readDF) { + FK_HOST_DEVICE_FUSE auto read(const Point thread, const ReadIOp& readDF) { if constexpr (TFI::ENABLED) { static_assert(isAnyReadType, "ReadIOp is not ReadType or ReadBackType"); return ReadIOp::Operation::template exec(thread, readDF); @@ -109,7 +109,7 @@ namespace fk { // namespace FusedKernel template FK_HOST_DEVICE_FUSE - void execute_instantiable_operations_helper(const Point& thread, const ReadIOp& readDF, + void execute_instantiable_operations_helper(const Point thread, const ReadIOp& readDF, const IOps&... iOps) { using ReadOperation = typename ReadIOp::Operation; using WriteOperation = typename LastType_t::Operation; @@ -136,12 +136,12 @@ namespace fk { // namespace FusedKernel } template - FK_HOST_DEVICE_FUSE void execute_instantiable_operations(const Point& thread, const IOps&... iOps) { + FK_HOST_DEVICE_FUSE void execute_instantiable_operations(const Point thread, const IOps&... iOps) { execute_instantiable_operations_helper(thread, iOps...); } template - FK_HOST_DEVICE_FUSE void execute_thread(const Point& thread, const ActiveThreads& activeThreads, const IOps&... iOps) { + FK_HOST_DEVICE_FUSE void execute_thread(const Point thread, const ActiveThreads& activeThreads, const IOps&... iOps) { using TFI = typename Details::TFI; if constexpr (!TFI::ENABLED) { execute_instantiable_operations(thread, iOps...); @@ -201,27 +201,6 @@ namespace fk { // namespace FusedKernel return Details{}; } } - template - FK_DEVICE_FUSE auto build_details(const ActiveThreads& activeThreads, const uint& readRow, const uint& writeRow) { - using Details = TransformDPPDetails(TFEN), FirstIOp, IOps...>; - using TFI = typename Details::TFI; - if constexpr (TFI::ENABLED) { - const ActiveThreads gridActiveThreads(static_cast(ceil(activeThreads.x / static_cast(TFI::elems_per_thread))), - activeThreads.y, activeThreads.z); - bool threadDivisible; - if constexpr (TFI::ENABLED) { - using ReadOperation = typename FirstIOp::Operation; - using WriteOperation = typename LastType_t::Operation; - threadDivisible = (readRow % TFI::elems_per_thread == 0) && (writeRow % TFI::elems_per_thread == 0); - } else { - threadDivisible = true; - } - const Details details{ gridActiveThreads, threadDivisible }; - return details; - } else { - return Details{}; - } - } }; // Note: there are no ParArch::GPU_NVIDIA_JIT DPP implementaitons, because @@ -310,8 +289,9 @@ namespace fk { // namespace FusedKernel } template - FK_HOST_DEVICE_FUSE void divergent_operate(const uint& z, const InstantiableOperationSequence& iOpSequence, - const IOpSequenceTypes&... iOpSequences) { + FK_HOST_DEVICE_FUSE void divergent_operate(const uint z, + const InstantiableOperationSequence& iOpSequence, + const IOpSequenceTypes&... iOpSequences) { if (OpSequenceNumber == SequenceSelector::at(z)) { apply(launchTransformDPP, iOpSequence.iOps); } else if constexpr (sizeof...(iOpSequences) > 0) { diff --git a/include/fused_kernel/core/execution_model/operation_model/batch_operations.h b/include/fused_kernel/core/execution_model/operation_model/batch_operations.h index 8423fbba..e6ff8cb8 100644 --- a/include/fused_kernel/core/execution_model/operation_model/batch_operations.h +++ b/include/fused_kernel/core/execution_model/operation_model/batch_operations.h @@ -51,8 +51,8 @@ ReadBackOperation>: ReadBackOperation: Instantiable - Implements exec(const Point& thread, const OperationData>& opData) - Implements exec(const Point& thread, const ParamsType& params, const BackIOp& backIOp) + Implements exec(const Point thread, const OperationData>& opData) + Implements exec(const Point thread, const ParamsType& params, const BackIOp& backIOp) Implements build(const OperationData>& opData) Implements build(const ParamsType& params, const BackIOp& backIOp) @@ -156,8 +156,8 @@ namespace fk { BatchRead: Instantiable ParamsType = BatchReadParams - Implements exec(const Point& thread, const OperationDataType& opData) - Implements exec(const Point& thread, const ParamsType& params) + Implements exec(const Point thread, const OperationDataType& opData) + Implements exec(const Point thread, const ParamsType& params) Implements build(const OperationDataType& opData) Implements build(const ParamsType& params) */ @@ -182,16 +182,16 @@ namespace fk { static constexpr bool IS_FUSED_OP = Operation::IS_FUSED_OP; static constexpr bool THREAD_FUSION = Operation::THREAD_FUSION; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -199,11 +199,11 @@ namespace fk { } template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const OperationDataType& opData) { return exec(thread, opData.params); } template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const ParamsType& params) { if constexpr (THREAD_FUSION) { return Operation::template exec(thread, params.opData[thread.z]); } else { @@ -239,16 +239,16 @@ namespace fk { static constexpr bool IS_FUSED_OP = Operation::IS_FUSED_OP; static constexpr bool THREAD_FUSION = false; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -256,11 +256,11 @@ namespace fk { } template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const OperationDataType& opData) { return exec(thread, opData.params); } template - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const ParamsType& params) { if (params.usedPlanes <= thread.z) { return params.default_value; } else { @@ -304,16 +304,16 @@ namespace fk { static constexpr bool IS_FUSED_OP = Operation::IS_FUSED_OP; static constexpr bool THREAD_FUSION = Operation::THREAD_FUSION; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -352,16 +352,16 @@ namespace fk { static constexpr bool IS_FUSED_OP = Operation::IS_FUSED_OP; static constexpr bool THREAD_FUSION = false; - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return Operation::num_elems_y(thread, opData.params.opData[thread.z]); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return BATCH; } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params.opData[thread.z]); } FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const OperationDataType& opData) { @@ -472,8 +472,8 @@ namespace fk { DECLARE_WRITE_PARENT_BASIC template - FK_HOST_DEVICE_FUSE void exec(const Point& thread, - const ThreadFusionType& input, + FK_HOST_DEVICE_FUSE void exec(const Point thread, + const ThreadFusionType input, const ParamsType& params) { if constexpr (THREAD_FUSION) { Operation::template exec(thread, input, params[thread.z]); @@ -481,10 +481,10 @@ namespace fk { Operation::exec(thread, input, params[thread.z]); } } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return Operation::num_elems_x(thread, opData.params[thread.z]); } - FK_HOST_DEVICE_FUSE uint pitch(const Point& thread, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE uint pitch(const Point thread, const OperationDataType& opData) { return Operation::pitch(thread, opData.params[thread.z]); } // Build WriteBatch from array of IOps diff --git a/include/fused_kernel/core/execution_model/operation_model/fused_operation.h b/include/fused_kernel/core/execution_model/operation_model/fused_operation.h index 1394f947..7bbcda13 100644 --- a/include/fused_kernel/core/execution_model/operation_model/fused_operation.h +++ b/include/fused_kernel/core/execution_model/operation_model/fused_operation.h @@ -55,14 +55,14 @@ namespace fk { DECLARE_OPEN_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const InputType input, const ParamsType& params) { return exec_helper(std::make_index_sequence{}, thread, input, params); } private: template FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, - const Point& thread, - const InputType& input, + const Point thread, + const InputType input, const ParamsType& params) { return (InputFoldType{thread, input} | ... | get_opt(params)).input; } @@ -88,21 +88,21 @@ namespace fk { DECLARE_READ_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const ParamsType& params) { return exec_helper(std::make_index_sequence{}, thread, params); } - FK_HOST_DEVICE_FUSE uint num_elems_x(const Point& thread, + FK_HOST_DEVICE_FUSE uint num_elems_x(const Point thread, const OperationDataType& opData) { return FirstType_t::Operation::num_elems_x(thread, get_opt<0>(opData.params)); } - FK_HOST_DEVICE_FUSE uint num_elems_y(const Point& thread, + FK_HOST_DEVICE_FUSE uint num_elems_y(const Point thread, const OperationDataType& opData) { return FirstType_t::Operation::num_elems_y(thread, get_opt<0>(opData.params)); } - FK_HOST_DEVICE_FUSE uint num_elems_z(const Point& thread, + FK_HOST_DEVICE_FUSE uint num_elems_z(const Point thread, const OperationDataType& opData) { return FirstType_t::Operation::num_elems_z(thread, get_opt<0>(opData.params)); } @@ -114,7 +114,7 @@ namespace fk { private: template FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, - const Point& thread, + const Point thread, const ParamsType& params) { return (thread | ... | get_opt(params)).input; } @@ -131,13 +131,13 @@ namespace fk { FK_STATIC_STRUCT(FusedOperation_, SelfType) DECLARE_UNARY_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType &input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { return exec_helper(std::make_index_sequence::size>{}, input); } private: template - FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, const InputType input) { constexpr OperationTuple poTup{}; // Optimization, we use a version of operator| that does not use InputTypeFold, // thus it does not propagate Point thread, because it is not needed. @@ -158,13 +158,13 @@ namespace fk { FK_STATIC_STRUCT(FusedOperation_, SelfType) DECLARE_CLOSED_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE void exec(const Point &thread, const ParamsType ¶ms) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const ParamsType ¶ms) { exec_helper(std::make_index_sequence{}, thread, params); } private: template - FK_HOST_DEVICE_FUSE void exec_helper(const std::index_sequence&, const Point &thread, + FK_HOST_DEVICE_FUSE void exec_helper(const std::index_sequence&, const Point thread, const ParamsType ¶ms) { LastType_t::Operation::exec(thread, (thread | ... | get_opt(params)).input, @@ -189,14 +189,14 @@ namespace fk { FK_STATIC_STRUCT(FusedOperation_, SelfType) DECLARE_WRITE_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE void exec(const Point &thread, const InputType& input, const ParamsType ¶ms) { + FK_HOST_DEVICE_FUSE void exec(const Point thread, const InputType input, const ParamsType ¶ms) { exec_helper(std::make_index_sequence{}, thread, input, params); } private: template - FK_HOST_DEVICE_FUSE void exec_helper(const std::index_sequence &, const Point &thread, - const InputType& input, const ParamsType ¶ms) { + FK_HOST_DEVICE_FUSE void exec_helper(const std::index_sequence &, const Point thread, + const InputType input, const ParamsType ¶ms) { LastType_t::Operation::exec( thread, (InputFoldType<>::build(thread, input) | ... | get_opt(params)).input, get_opt(params)); @@ -217,14 +217,14 @@ namespace fk { FK_STATIC_STRUCT(FusedOperation_, SelfType) DECLARE_BINARY_PARENT using Operations = TypeList; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType &input, const ParamsType ¶ms) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType ¶ms) { return exec_helper(std::make_index_sequence{}, input, params); } private: template FK_HOST_DEVICE_FUSE OutputType exec_helper(const std::index_sequence&, - const InputType& input, const ParamsType& params) { + const InputType input, const ParamsType& params) { // Optimization, we use a version of operator| that does not use InputTypeFold, // thus it does not propagate Point thread, because it is not needed. return (input | ... | get_opt(params)); diff --git a/include/fused_kernel/core/execution_model/operation_model/instantiable_operations.h b/include/fused_kernel/core/execution_model/operation_model/instantiable_operations.h index afe55b3d..d821603a 100644 --- a/include/fused_kernel/core/execution_model/operation_model/instantiable_operations.h +++ b/include/fused_kernel/core/execution_model/operation_model/instantiable_operations.h @@ -64,14 +64,14 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c Point thread; InputType input; - FK_HOST_DEVICE_CNST InputFoldType(const Point& thread_, const InputType& input_) + FK_HOST_DEVICE_CNST InputFoldType(const Point thread_, const InputType input_) : thread(thread_), input(input_) {} }; template <> struct InputFoldType { template - FK_HOST_DEVICE_FUSE auto build(const Point& thread, InputT&& input) { + FK_HOST_DEVICE_FUSE auto build(const Point thread, InputT&& input) { return InputFoldType>(thread, std::forward(input)); } }; @@ -84,7 +84,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c return Operation::getActiveThreads(*this); } - FK_HOST_DEVICE_CNST friend auto operator|(const Point& thread, const OperationData& opData) { + FK_HOST_DEVICE_CNST friend auto operator|(const Point thread, const OperationData& opData) { return InputFoldType<>::build(thread, Operation::exec(thread, opData)); } @@ -107,7 +107,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c return Fuser_t::fuse(std::forward(prevIOp), self); } - FK_HOST_DEVICE_CNST friend auto operator|(const Point& thread, const OperationData& opData) { + FK_HOST_DEVICE_CNST friend auto operator|(const Point thread, const OperationData& opData) { return InputFoldType<>::build(thread, Operation::exec(thread, opData)); } }; @@ -134,7 +134,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c * It can be composed of a single Operation or of a chain of Operations, in which case it wraps them into an * FusedOperation. * Expects Operation_t to have an static __device__ function member with the following parameters: - * OutputType exec(const InputType& input, const OperationData& opDat) + * OutputType exec(const InputType input, const OperationData& opDat) */ template struct BinaryInstantiableOperation final : public OperationData { @@ -164,7 +164,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c * Third parameter (back_function): it's a IOp that will be used at some point in the implementation of the * Operation. It can be any kind of IOp. * Expects Operation_t to have an static __device__ function member with the following parameters: - * OutputType exec(const InputType& input, const OperationData& opData) + * OutputType exec(const InputType input, const OperationData& opData) */ template struct TernaryInstantiableOperation final : public OperationData { @@ -194,7 +194,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c * It allows to execute the Operation (or chain of Unary Operations) on the input, and returns the result as output * in register memory. * Expects Operation_t to have an static __device__ function member with the following parameters: - * OutputType exec(const InputType& input) + * OutputType exec(const InputType input) */ template struct UnaryInstantiableOperation { @@ -275,7 +275,7 @@ FK_HOST_CNST auto then(const ContinuationIOp& cIOp, const ContinuationIOps&... c struct ClosedInstantiableOperation final : public OperationData { INSTANTIABLE_OPERATION_DETAILS_IS_ASSERT(ClosedType) - FK_HOST_DEVICE_CNST friend void operator|(const Point& thread, const OperationData& opData) { + FK_HOST_DEVICE_CNST friend void operator|(const Point thread, const OperationData& opData) { Operation::exec(thread, opData); } }; diff --git a/include/fused_kernel/core/execution_model/operation_model/parent_operations.h b/include/fused_kernel/core/execution_model/operation_model/parent_operations.h index 9882c0c7..c5c814f7 100644 --- a/include/fused_kernel/core/execution_model/operation_model/parent_operations.h +++ b/include/fused_kernel/core/execution_model/operation_model/parent_operations.h @@ -78,7 +78,7 @@ namespace fk { using OperationDataType = typename Parent::OperationDataType; \ using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ - FK_HOST_DEVICE_FUSE OutputType exec(const InputType &input, const OperationDataType &opData) { \ + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const OperationDataType &opData) { \ return exec(input, opData.params); \ } \ FK_HOST_FUSE InstantiableType build(const OperationDataType& opData) { return {opData}; } \ @@ -110,7 +110,7 @@ namespace fk { using OperationDataType = typename Parent::OperationDataType; \ using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ - FK_HOST_DEVICE_FUSE OutputType exec(const InputType &input, const OperationDataType &opData) { \ + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const OperationDataType &opData) { \ return exec(input, opData.params, opData.backIOp); \ } \ FK_HOST_FUSE InstantiableType build(const OperationDataType &opData) { return {opData}; } \ @@ -146,7 +146,7 @@ namespace fk { static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ static constexpr bool THREAD_FUSION = Parent::THREAD_FUSION; \ template \ - FK_HOST_DEVICE_FUSE auto exec(const Point& thread, const OperationDataType& opData) { \ + FK_HOST_DEVICE_FUSE auto exec(const Point thread, const OperationDataType& opData) { \ if constexpr (std::bool_constant::value) { \ return exec(thread, opData.params); \ } else { \ @@ -186,7 +186,7 @@ namespace fk { static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ static constexpr bool THREAD_FUSION = Parent::THREAD_FUSION; \ template \ - FK_HOST_DEVICE_FUSE void exec(const Point &thread, \ + FK_HOST_DEVICE_FUSE void exec(const Point thread, \ const ThreadFusionType &input, \ const OperationDataType &opData) { \ if constexpr (THREAD_FUSION) { \ @@ -222,8 +222,8 @@ namespace fk { using OperationDataType = typename Parent::OperationDataType; \ using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ - FK_HOST_DEVICE_FUSE OutputType exec(const Point& thread, \ - const InputType& input, \ + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, \ + const InputType input, \ const OperationDataType& opData) { \ return exec(thread, input, opData.params); \ } \ @@ -251,7 +251,7 @@ struct ClosedOperation { using OperationDataType = typename Parent::OperationDataType; \ using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ - FK_HOST_DEVICE_FUSE void exec(const Point &thread, const OperationDataType &opData) { \ + FK_HOST_DEVICE_FUSE void exec(const Point thread, const OperationDataType &opData) { \ exec(thread, opData.params); \ } \ FK_HOST_FUSE InstantiableType build(const OperationDataType &opData) { return { opData }; } \ @@ -285,7 +285,7 @@ struct ClosedOperation { using InstantiableType = typename Parent::InstantiableType; \ static constexpr bool IS_FUSED_OP = Parent::IS_FUSED_OP; \ static constexpr bool THREAD_FUSION = Parent::THREAD_FUSION; \ - FK_HOST_DEVICE_FUSE OutputType exec(const Point &thread, const OperationDataType &opData) { \ + FK_HOST_DEVICE_FUSE OutputType exec(const Point thread, const OperationDataType &opData) { \ return exec(thread, opData.params, opData.backIOp); \ } \ FK_HOST_FUSE InstantiableType build(const OperationDataType &opData) { return {opData}; } \ @@ -328,22 +328,22 @@ struct ClosedOperation { struct NumElems { template - FK_HOST_DEVICE_FUSE uint x(const Point& thread, const IOp& iOp) { + FK_HOST_DEVICE_FUSE uint x(const Point thread, const IOp& iOp) { static_assert(isAnyReadType || opIs, "Only Read, ReadBack, IncompleteReadBack and Ternary Types work with NumElems::x"); return IOp::Operation::num_elems_x(thread, iOp); } template - FK_HOST_DEVICE_FUSE uint y(const Point& thread, const IOp& iOp) { + FK_HOST_DEVICE_FUSE uint y(const Point thread, const IOp& iOp) { static_assert(isAnyReadType || opIs, "Only Read, ReadBack, IncompleteReadBack and Ternary Types work with NumElems::y"); return IOp::Operation::num_elems_y(thread, iOp); } template - FK_HOST_DEVICE_FUSE Size size(const Point& thread, const IOp& iOp) { + FK_HOST_DEVICE_FUSE Size size(const Point thread, const IOp& iOp) { static_assert(isAnyReadType || opIs, "Only Read, ReadBack, IncompleteReadBack and Ternary Types work with NumElems::size"); return Size(x(thread, iOp), y(thread, iOp)); } template - FK_HOST_DEVICE_FUSE uint z(const Point& thread, const IOp& iOp) { + FK_HOST_DEVICE_FUSE uint z(const Point thread, const IOp& iOp) { static_assert(isAnyReadType || opIs, "Only Read, ReadBack, IncompleteReadBack and Ternary Types work with NumElems::z"); return IOp::Operation::num_elems_z(thread, iOp); } diff --git a/include/fused_kernel/core/execution_model/operation_model/vector_operations.h b/include/fused_kernel/core/execution_model/operation_model/vector_operations.h index c223fbe5..228c9b85 100644 --- a/include/fused_kernel/core/execution_model/operation_model/vector_operations.h +++ b/include/fused_kernel/core/execution_model/operation_model/vector_operations.h @@ -32,7 +32,7 @@ namespace fk { using InputType = I; using OutputType = O; using InstanceType = UnaryType; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { static_assert(cn == cn, "Unary struct requires same number of channels for input and output types."); constexpr bool allCUDAOrNotCUDA = @@ -73,7 +73,7 @@ namespace fk { using InputType = I; using OutputType = O; using InstanceType = UnaryType; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input) { const auto input1 = get<0>(input); const auto input2 = get<1>(input); using I1 = get_t<0, I>; @@ -138,10 +138,10 @@ namespace fk { using ParamsType = P; using InstanceType = BinaryType; using OperationDataType = OperationData>; - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const OperationDataType& opData) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const OperationDataType& opData) { return BinaryV::exec(input, opData.params); } - FK_HOST_DEVICE_FUSE OutputType exec(const InputType& input, const ParamsType& params) { + FK_HOST_DEVICE_FUSE OutputType exec(const InputType input, const ParamsType& params) { static_assert(cn == cn, "Binary struct requires same number of channels for input and output types."); constexpr bool allCUDAOrNotCUDA = diff --git a/tests/examples/inlining_and_LDL_STL.h b/tests/examples/inlining_and_LDL_STL.h index a465ca56..184badb7 100644 --- a/tests/examples/inlining_and_LDL_STL.h +++ b/tests/examples/inlining_and_LDL_STL.h @@ -57,7 +57,7 @@ struct SimpleTransformDPPBaseReference { friend struct SimpleTransformDPPReference; // Allow TransformDPP to access private members private: template - FK_HOST_DEVICE_FUSE void execute_thread(const Point& thread, const ReadIOp& readDF, const IOps&... iOps) { + FK_HOST_DEVICE_FUSE void execute_thread(const Point thread, const ReadIOp& readDF, const IOps&... iOps) { using ReadOperation = typename ReadIOp::Operation; using WriteOperation = typename LastType_t::Operation; @@ -82,7 +82,7 @@ struct SimpleTransformDPPBaseReferenceFoldExpr { // to access private members private: template - FK_HOST_DEVICE_FUSE void execute_thread(const Point &thread, const ReadIOp &readDF, const IOps &...iOps) { + FK_HOST_DEVICE_FUSE void execute_thread(const Point thread, const ReadIOp &readDF, const IOps &...iOps) { using ReadOperation = typename ReadIOp::Operation; using WriteOperation = typename LastType_t::Operation; diff --git a/utests/algorithm/image_processing/utest_saturate/utest_saturate_common.h b/utests/algorithm/image_processing/utest_saturate/utest_saturate_common.h index 58bebef8..ccd3e8d5 100644 --- a/utests/algorithm/image_processing/utest_saturate/utest_saturate_common.h +++ b/utests/algorithm/image_processing/utest_saturate/utest_saturate_common.h @@ -24,7 +24,7 @@ constexpr T halfPositiveRange() { } template -constexpr OutputType expectedPositiveValue(const InputType &input) { +constexpr OutputType expectedPositiveValue(const InputType input) { if (cxp::cmp_greater::f(input, fk::maxValue>)) { return fk::maxValue; } else {