diff --git a/CMakeLists.txt b/CMakeLists.txt index c45fca3b72..b5beb75898 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,6 +177,7 @@ set(BOUT_SOURCES ./include/bout/sys/range.hxx ./include/bout/sys/timer.hxx ./include/bout/sys/type_name.hxx + ./include/bout/twiddle.hxx ./include/bout/sys/uncopyable.hxx ./include/bout/sys/uuid.h ./include/bout/sys/variant.hxx @@ -239,7 +240,7 @@ set(BOUT_SOURCES ./include/bout/invert/laplacexy2.hxx ./src/invert/laplacexy2/laplacexy2.cxx ./include/bout/invert/laplacexy2_hypre.hxx - ./src/invert/laplacexy2/laplacexy2_hypre.cxx + ./src/invert/laplacexy2/laplacexy2_hypre.cxx ./src/invert/laplacexz/impls/cyclic/laplacexz-cyclic.cxx ./src/invert/laplacexz/impls/cyclic/laplacexz-cyclic.hxx ./src/invert/laplacexz/impls/petsc/laplacexz-petsc.cxx @@ -386,8 +387,15 @@ if (BOUT_GENERATE_FIELDOPS) if (NOT ClangFormat_FOUND) message(FATAL_ERROR "clang-format not found, but you have requested to generate code!") endif() + if (BOUT_ENABLE_RAJA) + set(GEN_LOOP_EXEC "raja") + elseif (BOUT_ENABLE_OPENMP) + set(GEN_LOOP_EXEC "openmp") + else() + set(GEN_LOOP_EXEC "serial") + endif() add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/field/generated_fieldops.cxx - COMMAND ${Python3_EXECUTABLE} gen_fieldops.py --filename generated_fieldops.cxx.tmp + COMMAND ${Python3_EXECUTABLE} gen_fieldops.py --loop-exec ${GEN_LOOP_EXEC} --filename generated_fieldops.cxx.tmp COMMAND ${ClangFormat_BIN} generated_fieldops.cxx.tmp -i COMMAND ${CMAKE_COMMAND} -E rename generated_fieldops.cxx.tmp generated_fieldops.cxx DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/field/gen_fieldops.jinja ${CMAKE_CURRENT_SOURCE_DIR}/src/field/gen_fieldops.py @@ -518,7 +526,7 @@ if (BOUT_ENABLE_WARNINGS) $<$,$,$>: -Wall -Wextra > > $<$: - /W4 > + /W4 > $<$:-Xcompiler=-Wall -Xcompiler=-Wextra > ) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 8e84901806..96c28bab12 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1031,7 +1031,8 @@ class ELMpb : public PhysicsModel { vacuum_trans *= pnorm; // Transitions from 0 in core to 1 in vacuum - vac_mask = (1.0 - tanh((P0 - vacuum_pressure) / vacuum_trans)) / 2.0; + Field2D tanh_res = tanh((P0 - vacuum_pressure) / vacuum_trans); + vac_mask = (1.0 - tanh_res) / 2.0; if (spitzer_resist) { // Use Spitzer resistivity @@ -1213,7 +1214,7 @@ class ELMpb : public PhysicsModel { // Only if not restarting: Check initial perturbation // Set U to zero where P0 < vacuum_pressure - U = where(P0 - vacuum_pressure, U, 0.0); + U = where(Field2D{P0 - vacuum_pressure}, U, 0.0); if (constn0) { ubyn = U; @@ -1840,7 +1841,8 @@ class ELMpb : public PhysicsModel { ddt(U) -= 0.5 * Upara2 * bracket(Pi0, Dperp2Phi, bm_exb) / B0; Field3D B0phi = B0 * phi; mesh->communicate(B0phi); - Field3D B0phi0 = B0 * phi0; + Field2D res = B0 * phi0; + Field3D B0phi0 = res; mesh->communicate(B0phi0); ddt(U) += 0.5 * Upara2 * bracket(B0phi, Dperp2Pi0, bm_exb) / B0; ddt(U) += 0.5 * Upara2 * bracket(B0phi0, Dperp2Pi, bm_exb) / B0; diff --git a/include/bout/array.hxx b/include/bout/array.hxx index 2c42f15aad..4965aee880 100644 --- a/include/bout/array.hxx +++ b/include/bout/array.hxx @@ -67,6 +67,7 @@ struct ArrayData { auto& rm = umpire::ResourceManager::getInstance(); #if BOUT_HAS_CUDA auto allocator = rm.getAllocator(umpire::resource::Pinned); + //auto allocator = rm.getAllocator(umpire::resource::Unified); #else auto allocator = rm.getAllocator("HOST"); #endif diff --git a/include/bout/assert.hxx b/include/bout/assert.hxx index 653c44ed42..954ae8dba0 100644 --- a/include/bout/assert.hxx +++ b/include/bout/assert.hxx @@ -40,6 +40,7 @@ if (!(condition)) { \ throw BoutException("Assertion failed in {:s}, line {:d}: {:s}", __FILE__, __LINE__, \ #condition); \ + abort(); \ } #else // CHECKLEVEL >= 1 #define ASSERT1(condition) diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index c1f06fca7c..03bc4dcee4 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -2,7 +2,7 @@ * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu * * Contact Ben Dudson, bd512@york.ac.uk - * + * * This file is part of BOUT++. * * BOUT++ is free software: you can redistribute it and/or modify @@ -140,4 +140,16 @@ struct enumWrapper { /// Boundary condition function using FuncPtr = BoutReal (*)(BoutReal t, BoutReal x, BoutReal y, BoutReal z); +template +struct Constant { + T val; + struct View { + T v; + cudaStream_t stream = 0; + View(T v) : v(v) {} + __host__ __device__ T operator()(int) const { return v; } + }; + operator View() const { return {val}; } +}; + #endif // BOUT_TYPES_H diff --git a/include/bout/coordinates_accessor.hxx b/include/bout/coordinates_accessor.hxx index 532351d57a..2376ab5039 100644 --- a/include/bout/coordinates_accessor.hxx +++ b/include/bout/coordinates_accessor.hxx @@ -31,7 +31,7 @@ /// -> If Coordinates data is changed, the cache should be cleared /// by calling CoordinatesAccessor::clear() struct CoordinatesAccessor { - CoordinatesAccessor() = delete; + CoordinatesAccessor() {} /// Constructor from Coordinates /// Copies data from coords, doesn't modify it diff --git a/include/bout/field.hxx b/include/bout/field.hxx index 188d529ef0..fe2b4767d2 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -44,6 +44,8 @@ class Field; #include #include +#include "bout/fieldops.hxx" + class Mesh; /// Base class for scalar fields @@ -327,6 +329,12 @@ inline BoutReal min(const T& f, bool allpe = false, return result; } +template +inline BoutReal min(const BinaryExpr& f, bool allpe = false, + const std::string& rgn = "RGN_NOBNDRY") { + return min(ResT{f}, allpe, rgn); +} + /// Returns true if all elements of \p f over \p region are equal. By /// default only checks the local processor, use \p allpe to check /// globally @@ -412,6 +420,12 @@ inline BoutReal max(const T& f, bool allpe = false, return result; } +template +inline BoutReal max(const BinaryExpr& f, bool allpe = false, + const std::string& rgn = "RGN_NOBNDRY") { + return max(ResT{f}, allpe, rgn); +} + /// Mean of \p f, excluding the boundary/guard cells by default (can /// be changed with \p rgn argument). /// @@ -519,17 +533,33 @@ T pow(BoutReal lhs, const T& rhs, const std::string& rgn = "RGN_ALL") { #ifdef FIELD_FUNC #error This macro has already been defined #else -#define FIELD_FUNC(name, func) \ - template > \ - inline T name(const T& f, const std::string& rgn = "RGN_ALL") { \ - AUTO_TRACE(); \ - /* Check if the input is allocated */ \ - checkData(f); \ - /* Define and allocate the output result */ \ - T result{emptyFrom(f)}; \ - BOUT_FOR(d, result.getRegion(rgn)) { result[d] = func(f[d]); } \ - checkData(result); \ - return result; \ +#define FIELD_FUNC(name, func) \ + namespace bout::op { \ + struct name { \ + template \ + __host__ __device__ BoutReal operator()(int idx, const LView& L, \ + const RView& R) const { \ + return func(L(idx)); \ + } \ + }; \ + }; \ + template > \ + inline BinaryExpr name(const T& f, \ + const std::string& rgn = "RGN_ALL") { \ + std::cout << "RUNNING " #name " with CUDA\n"; \ + return BinaryExpr{static_cast(f), \ + static_cast(f), \ + bout::op::name{}, \ + f.getMesh(), \ + f.getLocation(), \ + f.getDirections(), \ + std::nullopt, \ + f.getRegion(rgn)}; \ + } \ + template \ + inline BinaryExpr name( \ + const BinaryExpr& f, const std::string& rgn = "RGN_ALL") { \ + return name(ResT{f}, rgn); \ } #endif diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 92658f1bbf..b452df7fd6 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -38,6 +38,8 @@ class Field2D; #include "bout/region.hxx" #include "bout/unused.hxx" +#include "bout/fieldops.hxx" + #if BOUT_HAS_RAJA #include "RAJA/RAJA.hpp" // using RAJA lib #endif @@ -45,6 +47,15 @@ class Field2D; class Field3D; class Mesh; +template +struct is_expr_field2d> + : std::integral_constant> + && is_expr_field2d_v>) + || (is_expr_constant_v> + && is_expr_field2d_v>) + || (is_expr_field2d_v> + && is_expr_constant_v>)> {}; + /*! * \brief 2D X-Y scalar fields * @@ -91,6 +102,18 @@ public: DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Average}); + template < + typename ResT, typename L, typename R, typename Func, + typename = std::enable_if_t<(is_expr_field2d_v && is_expr_field2d_v) + || (is_expr_constant_v && is_expr_field2d_v) + || (is_expr_field2d_v && is_expr_constant_v)>> + Field2D(const BinaryExpr& expr) { + std::cout << "RUNNING Field2D constructor with CUDA\n"; + Array data{expr.size()}; + expr.evaluate(&data[0]); + *this = std::move(Field2D{std::move(data), expr.getMesh(), expr.getLocation(), + expr.getDirections()}); + } /*! * Destructor */ @@ -166,6 +189,18 @@ public: */ Field2D& operator=(BoutReal rhs); + template + std::enable_if_t, Field2D&> + operator=(const BinaryExpr& expr) { + std::cout << "RUNNING Field2D operator= with CUDA\n"; + if (isAllocated()) { + expr.evaluate(&data[0]); + } else { + *this = Field2D{expr}; + } + return *this; + } + ///////////////////////////////////////////////////////// // Data access @@ -235,22 +270,23 @@ public: return operator()(jx, jy); } - /// In-place addition. Copy-on-write used if data is shared - Field2D& operator+=(const Field2D& rhs); - /// In-place addition. Copy-on-write used if data is shared - Field2D& operator+=(BoutReal rhs); - /// In-place subtraction. Copy-on-write used if data is shared - Field2D& operator-=(const Field2D& rhs); - /// In-place subtraction. Copy-on-write used if data is shared - Field2D& operator-=(BoutReal rhs); - /// In-place multiplication. Copy-on-write used if data is shared - Field2D& operator*=(const Field2D& rhs); - /// In-place multiplication. Copy-on-write used if data is shared - Field2D& operator*=(BoutReal rhs); - /// In-place division. Copy-on-write used if data is shared - Field2D& operator/=(const Field2D& rhs); - /// In-place division. Copy-on-write used if data is shared - Field2D& operator/=(BoutReal rhs); +#define FIELD2D_OP_EQUALS(OP_SYM) \ + template \ + std::enable_if_t || is_expr_constant_v, Field2D&> \ + operator OP_SYM## = (R rhs) { \ + if (data.unique()) { \ + auto expr = (*this)OP_SYM rhs; \ + expr.evaluate(&data[0]); \ + } else { \ + (*this) = (*this)OP_SYM rhs; \ + } \ + return *this; \ + } + + FIELD2D_OP_EQUALS(+) + FIELD2D_OP_EQUALS(-) + FIELD2D_OP_EQUALS(*) + FIELD2D_OP_EQUALS(/) // FieldData virtual functions @@ -276,6 +312,29 @@ public: int size() const override { return nx * ny; }; + struct View { + BoutReal* data; + int mul = 1; + int div = 1; + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul / div)]; + } + __host__ __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul)/div]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } + }; + operator View() { return View{&data[0]}; } + operator View() const { return View{const_cast(&data[0])}; } + + __device__ inline BoutReal operator()(int i) { return View()(i); } + __device__ inline BoutReal operator()(int i) const { return View()(i); } + private: /// Internal data array. Handles allocation/freeing of memory Array data; @@ -289,25 +348,91 @@ private: // Non-member overloaded operators -Field2D operator+(const Field2D& lhs, const Field2D& rhs); -Field2D operator-(const Field2D& lhs, const Field2D& rhs); -Field2D operator*(const Field2D& lhs, const Field2D& rhs); -Field2D operator/(const Field2D& lhs, const Field2D& rhs); - -Field3D operator+(const Field2D& lhs, const Field3D& rhs); -Field3D operator-(const Field2D& lhs, const Field3D& rhs); -Field3D operator*(const Field2D& lhs, const Field3D& rhs); -Field3D operator/(const Field2D& lhs, const Field3D& rhs); - -Field2D operator+(const Field2D& lhs, BoutReal rhs); -Field2D operator-(const Field2D& lhs, BoutReal rhs); -Field2D operator*(const Field2D& lhs, BoutReal rhs); -Field2D operator/(const Field2D& lhs, BoutReal rhs); - -Field2D operator+(BoutReal lhs, const Field2D& rhs); -Field2D operator-(BoutReal lhs, const Field2D& rhs); -Field2D operator*(BoutReal lhs, const Field2D& rhs); -Field2D operator/(BoutReal lhs, const Field2D& rhs); +#define FIELD2D_FIELD2D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } + +FIELD2D_FIELD2D_FIELD2D_OP(+, Add) +FIELD2D_FIELD2D_FIELD2D_OP(-, Sub) +FIELD2D_FIELD2D_FIELD2D_OP(*, Mul) +FIELD2D_FIELD2D_FIELD2D_OP(/, Div) + +#define FIELD3D_FIELD2D_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + int mesh_nz = rhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs).setScale(1, mesh_nz), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ + } + +FIELD3D_FIELD2D_FIELD3D_OP(+, Add) +FIELD3D_FIELD2D_FIELD3D_OP(-, Sub) +FIELD3D_FIELD2D_FIELD3D_OP(*, Mul) +FIELD3D_FIELD2D_FIELD3D_OP(/, Div) + +#define FIELD2D_FIELD2D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } + +FIELD2D_FIELD2D_BOUTREAL_OP(+, Add) +FIELD2D_FIELD2D_BOUTREAL_OP(-, Sub) +FIELD2D_FIELD2D_BOUTREAL_OP(*, Mul) +FIELD2D_FIELD2D_BOUTREAL_OP(/, Div) + +#define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(L lhs, const R & rhs) { \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + std::nullopt, \ + rhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } + +FIELD2D_BOUTREAL_FIELD2D_OP(+, Add) +FIELD2D_BOUTREAL_FIELD2D_OP(-, Sub) +FIELD2D_BOUTREAL_FIELD2D_OP(*, Mul) +FIELD2D_BOUTREAL_FIELD2D_OP(/, Div) /*! * Unary minus. Returns the negative of given field, diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index a75e38df36..9dc064d6e6 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -2,7 +2,7 @@ * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu * * Contact: Ben Dudson, bd512@york.ac.uk - * + * * This file is part of BOUT++. * * BOUT++ is free software: you can redistribute it and/or modify @@ -39,6 +39,8 @@ class Field3D; class Mesh; +#include "bout/fieldops.hxx" + /// Class for 3D X-Y-Z scalar fields /*! This class represents a scalar field defined over the mesh. @@ -183,6 +185,16 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); + template || is_expr_field3d_v>> + Field3D(const BinaryExpr& expr) { + //std::cout << "RUNNING constructor from BinaryExpr\n"; + Array data{expr.size()}; + expr.evaluate(&data[0]); + *this = std::move(Field3D{std::move(data), expr.getMesh(), expr.getLocation(), + expr.getDirections()}); + setRegion(expr.getRegionID()); + } /// Destructor ~Field3D() override; @@ -343,7 +355,7 @@ public: * Direct access to the underlying data array * * If CHECK > 2 then bounds checking is performed - * + * * If CHECK <= 2 then no checks are performed, to * allow inlining and optimisation of inner loops */ @@ -413,6 +425,27 @@ public: return &data[(jx * ny + jy) * nz]; } + struct View { + BoutReal* data; + int mul = 1; + int div = 1; + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul) / div]; + } + __host__ __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul) / div]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } + }; + operator View() { return View{&data[0]}; } + operator View() const { return View{const_cast(&data[0])}; } + //operator View() const { return View{&data[0]}; } + ///////////////////////////////////////////////////////// // Operators @@ -424,34 +457,42 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); - ///@} + template + std::enable_if_t, Field3D&> + operator=(BinaryExpr& expr) { + std::cout << "RUNNING operator= with CUDA\n"; + regionID = expr.getRegionID(); + if(isAllocated()) { + expr.evaluate(&data[0]); + } + else { + *this = Field3D{expr}; + } + return *this; + } - /// Addition operators - ///@{ - Field3D& operator+=(const Field3D& rhs); - Field3D& operator+=(const Field2D& rhs); - Field3D& operator+=(BoutReal rhs); ///@} - /// Subtraction operators - ///@{ - Field3D& operator-=(const Field3D& rhs); - Field3D& operator-=(const Field2D& rhs); - Field3D& operator-=(BoutReal rhs); - ///@} +#define FIELD3D_OP_EQUALS(OP_SYM) \ + template \ + std::enable_if_t< \ + is_expr_field3d_v || is_expr_field2d_v || is_expr_constant_v, Field3D&> \ + operator OP_SYM## = (const R& rhs) { \ + if (data.unique()) { \ + clearParallelSlices(); \ + auto expr = (*this)OP_SYM rhs; \ + expr.evaluate(&data[0]); \ + } else { \ + (*this) = (*this)OP_SYM rhs; \ + } \ + return *this; \ + } - /// Multiplication operators - ///@{ - Field3D& operator*=(const Field3D& rhs); - Field3D& operator*=(const Field2D& rhs); - Field3D& operator*=(BoutReal rhs); - ///@} + FIELD3D_OP_EQUALS(+) + FIELD3D_OP_EQUALS(-) + FIELD3D_OP_EQUALS(*) + FIELD3D_OP_EQUALS(/) - /// Division operators - ///@{ - Field3D& operator/=(const Field3D& rhs); - Field3D& operator/=(const Field2D& rhs); - Field3D& operator/=(BoutReal rhs); ///@} // FieldData virtual functions @@ -512,31 +553,105 @@ private: // Non-member overloaded operators +template +constexpr bool always_false = false; + // Binary operators FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); -Field3D operator+(const Field3D& lhs, const Field3D& rhs); -Field3D operator-(const Field3D& lhs, const Field3D& rhs); -Field3D operator*(const Field3D& lhs, const Field3D& rhs); -Field3D operator/(const Field3D& lhs, const Field3D& rhs); +#define FIELD3D_FIELD3D_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template && is_expr_field3d_v>> \ + BinaryExpr operator OP_SYM(const L& lhs, \ + const R& rhs) { \ + auto regionID = \ + lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) \ + : lhs.getMesh()->getRegion("RGN_ALL"))}; \ + } + +FIELD3D_FIELD3D_FIELD3D_OP(+, Add) +FIELD3D_FIELD3D_FIELD3D_OP(-, Sub) +FIELD3D_FIELD3D_FIELD3D_OP(*, Mul) +FIELD3D_FIELD3D_FIELD3D_OP(/, Div) + +#define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> \ + operator OP_SYM(const L& lhs, const R& rhs) { \ + auto regionID = lhs.getRegionID(); \ + int mesh_nz = lhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs).setScale(1, mesh_nz), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + lhs.getMesh()->getRegion("RGN_ALL")}; \ + } -Field3D operator+(const Field3D& lhs, const Field2D& rhs); -Field3D operator-(const Field3D& lhs, const Field2D& rhs); -Field3D operator*(const Field3D& lhs, const Field2D& rhs); -Field3D operator/(const Field3D& lhs, const Field2D& rhs); +FIELD3D_FIELD3D_FIELD2D_OP(+, Add) +FIELD3D_FIELD3D_FIELD2D_OP(-, Sub) +FIELD3D_FIELD3D_FIELD2D_OP(*, Mul) +FIELD3D_FIELD3D_FIELD2D_OP(/, Div) + +#define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L& lhs, R rhs) { \ + auto regionID = lhs.getRegionID(); \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + lhs.getMesh()->getRegion("RGN_ALL")}; \ + } -Field3D operator+(const Field3D& lhs, BoutReal rhs); -Field3D operator-(const Field3D& lhs, BoutReal rhs); -Field3D operator*(const Field3D& lhs, BoutReal rhs); -Field3D operator/(const Field3D& lhs, BoutReal rhs); +FIELD3D_FIELD3D_BOUTREAL_OP(+, Add) +FIELD3D_FIELD3D_BOUTREAL_OP(-, Sub) +FIELD3D_FIELD3D_BOUTREAL_OP(*, Mul) +FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) + +#define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(const L& lhs, const R& rhs) { \ + auto regionID = rhs.getRegionID(); \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ + } -Field3D operator+(BoutReal lhs, const Field3D& rhs); -Field3D operator-(BoutReal lhs, const Field3D& rhs); -Field3D operator*(BoutReal lhs, const Field3D& rhs); -Field3D operator/(BoutReal lhs, const Field3D& rhs); +FIELD3D_BOUTREAL_FIELD3D_OP(+, Add) +FIELD3D_BOUTREAL_FIELD3D_OP(-, Sub) +FIELD3D_BOUTREAL_FIELD3D_OP(*, Mul) +FIELD3D_BOUTREAL_FIELD3D_OP(/, Div) /*! * Unary minus. Returns the negative of given field, @@ -567,7 +682,7 @@ void checkData(const Field3D& f, const std::string& region = "RGN_NOBNDRY"); /// Ignored with disabled CHECK; Throw an exception if \p f is not /// allocated or if any elements are non-finite (for CHECK > 2) inline void checkData(const Field3D& UNUSED(f), - const std::string& UNUSED(region) = "RGN_NOBNDRY"){}; + const std::string& UNUSED(region) = "RGN_NOBNDRY") {}; #endif /// Fourier filtering, removes all except one mode @@ -650,4 +765,16 @@ bool operator==(const Field3D& a, const Field3D& b); /// Output a string describing a Field3D to a stream std::ostream& operator<<(std::ostream& out, const Field3D& value); +// A raw Field3D is an expression leaf +template <> +struct is_expr_field3d : std::true_type {}; + +template <> +struct is_expr_field2d : std::true_type {}; + +template +struct is_expr_field3d> + : std::integral_constant>::value + || is_expr_field3d_v>> {}; + #endif /* BOUT_FIELD3D_H */ diff --git a/include/bout/field_accessor.hxx b/include/bout/field_accessor.hxx index 69b58da979..a43420d6b3 100644 --- a/include/bout/field_accessor.hxx +++ b/include/bout/field_accessor.hxx @@ -57,10 +57,17 @@ struct FieldAccessor { /// Constructor from Field3D /// /// @param[in] f The field to access. Must already be allocated - explicit FieldAccessor(FieldType& f) : coords(f.getCoordinates()) { + explicit FieldAccessor(FieldType& f) { ASSERT0(f.getLocation() == location); ASSERT0(f.isAllocated()); + if (auto* Coords = f.getCoordinates()) { + coords = CoordinatesAccessor{Coords}; + } + else { + coords = CoordinatesAccessor{}; + } + data = BoutRealArray{&f(0, 0, 0)}; // Field size @@ -81,15 +88,20 @@ struct FieldAccessor { ddt = BoutRealArray{&(f.timeDeriv()->operator()(0, 0, 0))}; } + explicit FieldAccessor(const FieldType& f) : FieldAccessor(const_cast(f)) {} + /// Provide shorthand for access to field data. /// Does not convert between 3D and 2D indices, /// so fa[i] is equivalent to fa.data[i]. /// BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } + __device__ inline BoutReal operator()(int i) const { return data[i]; } BOUT_HOST_DEVICE inline const BoutReal& operator[](const Ind3D& ind) const { return data[ind.ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](const Ind3D& ind) { return data[ind.ind]; } // Pointers to the field data arrays // These are wrapped in BoutRealArray types so they can be indexed with Ind3D or int @@ -115,6 +127,9 @@ struct FieldAccessor { template using Field2DAccessor = FieldAccessor; +template +using Field3DAccessor = FieldAccessor; + /// Syntactic sugar for time derivative of a field /// /// Usage: @@ -130,4 +145,28 @@ BOUT_HOST_DEVICE inline BoutRealArray& ddt(const FieldAccessor(fa.ddt); } +struct FieldPerpAccessor { + FieldPerpAccessor() = delete; + + int nx, nz; + int yindex; + BoutReal* data; + + explicit FieldPerpAccessor(const FieldPerp& f) { + ASSERT0(f.isAllocated()); + + data = BoutRealArray{const_cast(&f(0, 0, 0))}; + + // Field size + nx = f.getNx(); + nz = f.getNz(); + + yindex = f.getIndex(); + } + + BOUT_HOST_DEVICE int getIndex() const { return yindex; } + BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } +}; + #endif diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx new file mode 100644 index 0000000000..114793ebc8 --- /dev/null +++ b/include/bout/fieldops.hxx @@ -0,0 +1,282 @@ +#pragma once +#ifndef BOUT_FIELDOPS_HXX +#define BOUT_FIELDOPS_HXX + +#include "bout/array.hxx" +#include "bout/bout_types.hxx" + +#include +#include +#include +#include + +class Mesh; +class Field3D; +class Field2D; + +template +struct is_expr_field2d : std::false_type {}; + +template +inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; + +// Base template: nothing is an expression by default +template +struct is_expr_field3d : std::false_type {}; + +template +struct is_expr_fieldperp : std::false_type {}; + +template +inline constexpr bool is_expr_fieldperp_v = is_expr_fieldperp>::value; + +// Helper variable template +template +inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; + +template +struct is_expr_constant : std::bool_constant> {}; + +template +inline constexpr bool is_expr_constant_v = is_expr_constant>::value; + +template +struct is_expr_constant> + : std::integral_constant>> {}; + +constexpr int THREADS = 128; +namespace bout { +namespace op { +struct Assign { + int scale = 1; + int offset = 0; + template + __host__ __device__ void operator()(int idx, BoutReal* out, const Expr& expr) const { + out[(idx * scale) + offset] = expr.lhs(idx) + expr.rhs(idx); + } +}; + +struct Add { + template + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) + R(idx); + } + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + return a + b; + } +}; + struct Sub { + template + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) - R(idx); + } + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { + return a - b; + } + }; + struct Mul { + template + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) * R(idx); + } + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { + return a * b; + } + }; + struct Div { + template + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) / R(idx); + } + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { + return a / b; + } + }; +}; +}; + +template +__global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Expr expr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int e = expr.size(); + + // In-bounds version + //if (tid < e) { + // int idx = expr.regionIdx(tid); + // out[idx] = expr(idx); // single‐pass fusion + //} + + // Out-of-bounds version + if (tid >= e) { + return; + } + int idx = expr.regionIdx(tid); + out[idx] = expr(idx); // single‐pass fusion + + // Grid-strided loop + //int stride = blockDim.x * gridDim.x; + //for (int i = tid; i < e; i += stride) { + // int idx = expr.regionIdx(i); + // out[idx] = expr(idx); // single‐pass fusion + //} +} + +inline std::unordered_map> regionIndicesCache; + +struct StreamsRAII { + std::vector streams; + + cudaStream_t get() { + cudaStream_t stream = 0; + + if (streams.empty()) { + if (cudaStreamCreate(&stream) != cudaSuccess) { + throw BoutException("Failed to create CUDA stream"); + } + } else { + stream = streams.back(); + streams.pop_back(); + } + + return stream; + } + + void put(cudaStream_t stream) { streams.push_back(stream); } + + ~StreamsRAII() { + for (auto& stream : streams) { + cudaStreamDestroy(stream); + } + } + + StreamsRAII() = default; + StreamsRAII(const StreamsRAII&) = delete; + StreamsRAII(StreamsRAII&&) = delete; + StreamsRAII& operator=(const StreamsRAII&) = delete; + StreamsRAII& operator=(StreamsRAII&&) = delete; +}; +inline struct StreamsRAII streams; + +template +struct BinaryExpr { + typename L::View lhs; + typename R::View rhs; + Array indices; + Func f; + + Mesh* mesh; + CELL_LOC location = CELL_CENTRE; + DirectionTypes directions; + std::optional regionID; + + template + BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, + CELL_LOC location, DirectionTypes directions, std::optional regionID, + const Region& region) + //: lhs(static_cast(lhs)), rhs(static_cast(rhs)), + : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), + regionID(regionID), indices(region.getIndices().size()) { + // Copy the region indices into the managed array + for (int i = 0; i < indices.size(); ++i) { + indices[i] = region.getIndices()[i].ind; + } + //std::cout << "===PRE-sorting indices\n"; + //for (auto& ind : indices) { + // std::cout << ind << " "; + //} + //std::cout << "===end PRE\n"; + //std::sort(indices.begin(), indices.end(), + // [](const auto& a, const auto& b) { return a < b; }); + //std::cout << "===POST-sorting indices\n"; + //for (auto& ind : indices) { + // std::cout << ind << " "; + //} + //std::cout << "===end POST\n"; + //if (regionIndicesCache.find(static_cast(const_cast*>(®ion))) + // != regionIndicesCache.end()) { + // // If we have already computed the indices for this region, use them + // indices = + // regionIndicesCache[static_cast(const_cast*>(®ion))]; + //} else { + // // Otherwise, compute the indices and store them in the cache + // indices = Array(region.getIndices().size()); + // // Copy the region indices into the managed array + // for (int i = 0; i < indices.size(); ++i) { + // indices[i] = region.getIndices()[i].ind; + // } + // regionIndicesCache[static_cast(const_cast*>(®ion))] = + // indices; + //} + } + + BinaryExpr& operator=(BinaryExpr const&) = delete; + BinaryExpr& operator=(BinaryExpr&&) = delete; + + inline int size() const { return indices.size(); } + inline BoutReal operator()(int idx) const { + return f(idx, lhs, rhs); // single‐pass fusion + } + inline int regionIdx(int idx) const { return indices[idx]; } + + //operator ResT() { return ResT{*this}; } + struct View { + typename L::View lhs; + typename R::View rhs; + const int* indices; + int num_indices; + Func f; + int mul = 1; + int div = 1; + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } + __host__ __device__ __forceinline__ int size() const { return num_indices; } + __host__ __device__ __forceinline__ int regionIdx(int idx) const { + return indices[idx]; + } + __host__ __device__ __forceinline__ BoutReal operator()(int idx) const { + return f((idx * mul) / div, lhs, rhs); // single‐pass fusion + //return f(lhs((idx * mul) / div), rhs((idx * mul) / div)); // single‐pass fusion + } + }; + + operator View() { return View{lhs, rhs, &indices[0], indices.size(), f}; } + operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } + + void evaluate(BoutReal* data) const { +#if 1 + cudaStream_t stream = streams.get(); + int blocks = (size() + THREADS - 1) / THREADS; + evaluatorExpr<<>>(&data[0], static_cast(*this)); + cudaStreamSynchronize(stream); + streams.put(stream); +#endif + +#if 0 + // OpenMP impl. + int e = size(); + //#pragma omp parallel for + for (int i = 0; i < e; ++i) { + int idx = regionIdx(i); + data[idx] = operator()(idx); // single‐pass fusion + } +#endif + } + + Mesh* getMesh() const { return mesh; } + CELL_LOC getLocation() const { return location; } + DirectionTypes getDirections() const { return directions; } + std::optional getRegionID() const { return regionID; }; +}; + +#endif // BOUT_FIELDSOPS_HXX diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index 6995308dbe..49d4fec1b7 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -86,6 +86,17 @@ public: DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); + template < + typename ResT, typename L, typename R, typename Func, + typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v)>> + FieldPerp(const BinaryExpr& expr) { + std::cout << "RUNNING FieldPerp constructor with CUDA\n"; + Array data{expr.size()}; + expr.evaluate(&data[0]); + *this = std::move(FieldPerp{std::move(data), expr.getMesh(), expr.getLocation(), + /* yindex */ -1, expr.getDirections()}); + } + ~FieldPerp() override = default; /*! @@ -292,6 +303,26 @@ public: int size() const override { return nx * nz; }; + struct View { + BoutReal* data; + int mul = 1; + int div = 1; + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul) / div]; + } + __host__ __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul) / div]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } + }; + operator View() { return View{&data[0]}; } + operator View() const { return View{const_cast(&data[0])}; } + private: /// The Y index at which this FieldPerp is defined int yindex{-1}; @@ -379,4 +410,12 @@ bool operator==(const FieldPerp& a, const FieldPerp& b); /// Output a string describing a FieldPerp to a stream std::ostream& operator<<(std::ostream& out, const FieldPerp& value); +template <> +struct is_expr_fieldperp : std::true_type {}; + +template +struct is_expr_fieldperp> + : std::integral_constant> + && is_expr_fieldperp_v>> {}; + #endif diff --git a/include/bout/interpolation.hxx b/include/bout/interpolation.hxx index 2c7df4472d..85c04cf897 100644 --- a/include/bout/interpolation.hxx +++ b/include/bout/interpolation.hxx @@ -55,7 +55,8 @@ inline BoutReal interp(const stencil& s) { @param[in] region Region where output will be calculated */ template -const T interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { +std::enable_if_t || bout::utils::is_Field3D_v, const T> +interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { AUTO_TRACE(); static_assert(bout::utils::is_Field2D_v || bout::utils::is_Field3D_v, "interp_to must be templated with one of Field2D or Field3D."); @@ -203,4 +204,17 @@ const T interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_AL return result; } + +template +std::enable_if_t && !bout::utils::is_Field3D_v, const Field3D> +interp_to(const E &expr, CELL_LOC loc, const std::string rgn = "RGN_ALL") { + return interp_to( Field3D{expr}, loc, std::move(rgn) ); +} + +template +std::enable_if_t && !bout::utils::is_Field2D_v, const Field2D> +interp_to(const E &expr, CELL_LOC loc, const std::string rgn = "RGN_ALL") { + return interp_to( Field2D{expr}, loc, std::move(rgn) ); +} + #endif // BOUT_INTERP_H diff --git a/include/bout/mesh.hxx b/include/bout/mesh.hxx index a1c88a2634..b6553d06ec 100644 --- a/include/bout/mesh.hxx +++ b/include/bout/mesh.hxx @@ -762,6 +762,11 @@ public: return {(indPerp.ind - jz) * LocalNy + LocalNz * jy + jz, LocalNy, LocalNz}; } + BOUT_HOST_DEVICE int flatIndPerpto3D(const int& flatIndPerp, const int nz, int jy = 0) const { + int jz = flatIndPerp % nz; + return (flatIndPerp - jz) * LocalNy + LocalNz * jy + jz; + } + /// Converts an Ind3D to an Ind2D representing a 2D index using a lookup -- to be used with care Ind2D map3Dto2D(const Ind3D& ind3D) { return {indexLookup3Dto2D[ind3D.ind], LocalNy, 1}; diff --git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index b9f6913459..20929304b5 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -14,6 +14,7 @@ */ #pragma once +#include "bout/array.hxx" #ifndef RAJALIB_H #define RAJALIB_H @@ -23,6 +24,15 @@ #include "RAJA/RAJA.hpp" // using RAJA lib +#if BOUT_HAS_CUDA +// TODO: Make configurable +const int CUDA_BLOCK_SIZE = 256; +using EXEC_POL = RAJA::cuda_exec; +//using EXEC_POL = RAJA::loop_exec; +#else // not BOUT_USE_CUDA +using EXEC_POL = RAJA::loop_exec; +#endif // end BOUT_USE_CUDA + /// Wrapper around RAJA::forall /// Enables computations to be done on CPU or GPU (CUDA). /// @@ -81,7 +91,7 @@ struct RajaForAll { // Note: must be a local variable const int* _ob_i_ind_raw = &_ob_i_ind[0]; RAJA::forall(RAJA::RangeSegment(0, _ob_i_ind.size()), - [=] RAJA_DEVICE(int id) { + [=] RAJA_DEVICE(int id) mutable { // Look up index and call user function f(_ob_i_ind_raw[id]); }); @@ -127,7 +137,7 @@ private: /// to create variables which shadow the class members. /// #define BOUT_FOR_RAJA(index, region, ...) \ - RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) +RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable #else // BOUT_HAS_RAJA diff --git a/include/bout/single_index_ops.hxx b/include/bout/single_index_ops.hxx index 60bd78bc36..c29d1a471f 100644 --- a/include/bout/single_index_ops.hxx +++ b/include/bout/single_index_ops.hxx @@ -7,17 +7,6 @@ #include "field_accessor.hxx" -#if BOUT_HAS_RAJA -//-- RAJA CUDA settings--------------------------------------------------------start -#if BOUT_HAS_CUDA -const int CUDA_BLOCK_SIZE = 256; // TODO: Make configurable -using EXEC_POL = RAJA::cuda_exec; -#else // not BOUT_USE_CUDA -using EXEC_POL = RAJA::loop_exec; -#endif // end BOUT_USE_CUDA -////-----------CUDA settings------------------------------------------------------end -#endif // end BOUT_HAS_RAJA - // Ind3D: i.zp(): BOUT_HOST_DEVICE inline int i_zp(const int id, const int nz) { int jz = id % nz; diff --git a/include/bout/twiddle.hxx b/include/bout/twiddle.hxx new file mode 100644 index 0000000000..6da72dd8ff --- /dev/null +++ b/include/bout/twiddle.hxx @@ -0,0 +1,1025 @@ +__constant__ double2 c_twiddle_16[16] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9238795325112867, -0.3826834323650898}, // k=1 + {0.7071067811865476, -0.7071067811865475}, // k=2 + {0.3826834323650898, -0.9238795325112867}, // k=3 + {0.0000000000000001, -1.0000000000000000}, // k=4 + {-0.3826834323650897, -0.9238795325112867}, // k=5 + {-0.7071067811865475, -0.7071067811865476}, // k=6 + {-0.9238795325112867, -0.3826834323650899}, // k=7 + {-1.0000000000000000, -0.0000000000000001}, // k=8 + {-0.9238795325112868, 0.3826834323650897}, // k=9 + {-0.7071067811865477, 0.7071067811865475}, // k=10 + {-0.3826834323650903, 0.9238795325112865}, // k=11 + {-0.0000000000000002, 1.0000000000000000}, // k=12 + {0.3826834323650900, 0.9238795325112866}, // k=13 + {0.7071067811865474, 0.7071067811865477}, // k=14 + {0.9238795325112865, 0.3826834323650904}, // k=15 +}; + +__constant__ double2 c_twiddle_32[32] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9807852804032304, -0.1950903220161282}, // k=1 + {0.9238795325112867, -0.3826834323650898}, // k=2 + {0.8314696123025452, -0.5555702330196022}, // k=3 + {0.7071067811865476, -0.7071067811865475}, // k=4 + {0.5555702330196023, -0.8314696123025452}, // k=5 + {0.3826834323650898, -0.9238795325112867}, // k=6 + {0.1950903220161283, -0.9807852804032304}, // k=7 + {0.0000000000000001, -1.0000000000000000}, // k=8 + {-0.1950903220161282, -0.9807852804032304}, // k=9 + {-0.3826834323650897, -0.9238795325112867}, // k=10 + {-0.5555702330196020, -0.8314696123025455}, // k=11 + {-0.7071067811865475, -0.7071067811865476}, // k=12 + {-0.8314696123025453, -0.5555702330196022}, // k=13 + {-0.9238795325112867, -0.3826834323650899}, // k=14 + {-0.9807852804032304, -0.1950903220161286}, // k=15 + {-1.0000000000000000, -0.0000000000000001}, // k=16 + {-0.9807852804032304, 0.1950903220161284}, // k=17 + {-0.9238795325112868, 0.3826834323650897}, // k=18 + {-0.8314696123025455, 0.5555702330196020}, // k=19 + {-0.7071067811865477, 0.7071067811865475}, // k=20 + {-0.5555702330196022, 0.8314696123025452}, // k=21 + {-0.3826834323650903, 0.9238795325112865}, // k=22 + {-0.1950903220161287, 0.9807852804032303}, // k=23 + {-0.0000000000000002, 1.0000000000000000}, // k=24 + {0.1950903220161283, 0.9807852804032304}, // k=25 + {0.3826834323650900, 0.9238795325112866}, // k=26 + {0.5555702330196018, 0.8314696123025455}, // k=27 + {0.7071067811865474, 0.7071067811865477}, // k=28 + {0.8314696123025452, 0.5555702330196022}, // k=29 + {0.9238795325112865, 0.3826834323650904}, // k=30 + {0.9807852804032303, 0.1950903220161287}, // k=31 +}; + +__constant__ double2 c_twiddle_64[64] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9951847266721969, -0.0980171403295606}, // k=1 + {0.9807852804032304, -0.1950903220161282}, // k=2 + {0.9569403357322088, -0.2902846772544623}, // k=3 + {0.9238795325112867, -0.3826834323650898}, // k=4 + {0.8819212643483550, -0.4713967368259976}, // k=5 + {0.8314696123025452, -0.5555702330196022}, // k=6 + {0.7730104533627370, -0.6343932841636455}, // k=7 + {0.7071067811865476, -0.7071067811865475}, // k=8 + {0.6343932841636455, -0.7730104533627370}, // k=9 + {0.5555702330196023, -0.8314696123025452}, // k=10 + {0.4713967368259978, -0.8819212643483549}, // k=11 + {0.3826834323650898, -0.9238795325112867}, // k=12 + {0.2902846772544623, -0.9569403357322089}, // k=13 + {0.1950903220161283, -0.9807852804032304}, // k=14 + {0.0980171403295608, -0.9951847266721968}, // k=15 + {0.0000000000000001, -1.0000000000000000}, // k=16 + {-0.0980171403295606, -0.9951847266721969}, // k=17 + {-0.1950903220161282, -0.9807852804032304}, // k=18 + {-0.2902846772544622, -0.9569403357322089}, // k=19 + {-0.3826834323650897, -0.9238795325112867}, // k=20 + {-0.4713967368259977, -0.8819212643483550}, // k=21 + {-0.5555702330196020, -0.8314696123025455}, // k=22 + {-0.6343932841636454, -0.7730104533627371}, // k=23 + {-0.7071067811865475, -0.7071067811865476}, // k=24 + {-0.7730104533627370, -0.6343932841636455}, // k=25 + {-0.8314696123025453, -0.5555702330196022}, // k=26 + {-0.8819212643483549, -0.4713967368259979}, // k=27 + {-0.9238795325112867, -0.3826834323650899}, // k=28 + {-0.9569403357322088, -0.2902846772544624}, // k=29 + {-0.9807852804032304, -0.1950903220161286}, // k=30 + {-0.9951847266721968, -0.0980171403295608}, // k=31 + {-1.0000000000000000, -0.0000000000000001}, // k=32 + {-0.9951847266721969, 0.0980171403295606}, // k=33 + {-0.9807852804032304, 0.1950903220161284}, // k=34 + {-0.9569403357322089, 0.2902846772544621}, // k=35 + {-0.9238795325112868, 0.3826834323650897}, // k=36 + {-0.8819212643483550, 0.4713967368259976}, // k=37 + {-0.8314696123025455, 0.5555702330196020}, // k=38 + {-0.7730104533627371, 0.6343932841636453}, // k=39 + {-0.7071067811865477, 0.7071067811865475}, // k=40 + {-0.6343932841636459, 0.7730104533627367}, // k=41 + {-0.5555702330196022, 0.8314696123025452}, // k=42 + {-0.4713967368259979, 0.8819212643483549}, // k=43 + {-0.3826834323650903, 0.9238795325112865}, // k=44 + {-0.2902846772544624, 0.9569403357322088}, // k=45 + {-0.1950903220161287, 0.9807852804032303}, // k=46 + {-0.0980171403295605, 0.9951847266721969}, // k=47 + {-0.0000000000000002, 1.0000000000000000}, // k=48 + {0.0980171403295601, 0.9951847266721969}, // k=49 + {0.1950903220161283, 0.9807852804032304}, // k=50 + {0.2902846772544621, 0.9569403357322089}, // k=51 + {0.3826834323650900, 0.9238795325112866}, // k=52 + {0.4713967368259976, 0.8819212643483550}, // k=53 + {0.5555702330196018, 0.8314696123025455}, // k=54 + {0.6343932841636456, 0.7730104533627369}, // k=55 + {0.7071067811865474, 0.7071067811865477}, // k=56 + {0.7730104533627367, 0.6343932841636459}, // k=57 + {0.8314696123025452, 0.5555702330196022}, // k=58 + {0.8819212643483548, 0.4713967368259979}, // k=59 + {0.9238795325112865, 0.3826834323650904}, // k=60 + {0.9569403357322088, 0.2902846772544625}, // k=61 + {0.9807852804032303, 0.1950903220161287}, // k=62 + {0.9951847266721969, 0.0980171403295605}, // k=63 +}; + +__constant__ double2 c_twiddle_128[128] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9987954562051724, -0.0490676743274180}, // k=1 + {0.9951847266721969, -0.0980171403295606}, // k=2 + {0.9891765099647810, -0.1467304744553617}, // k=3 + {0.9807852804032304, -0.1950903220161282}, // k=4 + {0.9700312531945440, -0.2429801799032639}, // k=5 + {0.9569403357322088, -0.2902846772544623}, // k=6 + {0.9415440651830208, -0.3368898533922201}, // k=7 + {0.9238795325112867, -0.3826834323650898}, // k=8 + {0.9039892931234433, -0.4275550934302821}, // k=9 + {0.8819212643483550, -0.4713967368259976}, // k=10 + {0.8577286100002721, -0.5141027441932217}, // k=11 + {0.8314696123025452, -0.5555702330196022}, // k=12 + {0.8032075314806449, -0.5956993044924334}, // k=13 + {0.7730104533627370, -0.6343932841636455}, // k=14 + {0.7409511253549591, -0.6715589548470183}, // k=15 + {0.7071067811865476, -0.7071067811865475}, // k=16 + {0.6715589548470183, -0.7409511253549591}, // k=17 + {0.6343932841636455, -0.7730104533627370}, // k=18 + {0.5956993044924335, -0.8032075314806448}, // k=19 + {0.5555702330196023, -0.8314696123025452}, // k=20 + {0.5141027441932217, -0.8577286100002721}, // k=21 + {0.4713967368259978, -0.8819212643483549}, // k=22 + {0.4275550934302822, -0.9039892931234433}, // k=23 + {0.3826834323650898, -0.9238795325112867}, // k=24 + {0.3368898533922201, -0.9415440651830208}, // k=25 + {0.2902846772544623, -0.9569403357322089}, // k=26 + {0.2429801799032640, -0.9700312531945440}, // k=27 + {0.1950903220161283, -0.9807852804032304}, // k=28 + {0.1467304744553617, -0.9891765099647810}, // k=29 + {0.0980171403295608, -0.9951847266721968}, // k=30 + {0.0490676743274181, -0.9987954562051724}, // k=31 + {0.0000000000000001, -1.0000000000000000}, // k=32 + {-0.0490676743274180, -0.9987954562051724}, // k=33 + {-0.0980171403295606, -0.9951847266721969}, // k=34 + {-0.1467304744553616, -0.9891765099647810}, // k=35 + {-0.1950903220161282, -0.9807852804032304}, // k=36 + {-0.2429801799032639, -0.9700312531945440}, // k=37 + {-0.2902846772544622, -0.9569403357322089}, // k=38 + {-0.3368898533922199, -0.9415440651830208}, // k=39 + {-0.3826834323650897, -0.9238795325112867}, // k=40 + {-0.4275550934302819, -0.9039892931234434}, // k=41 + {-0.4713967368259977, -0.8819212643483550}, // k=42 + {-0.5141027441932217, -0.8577286100002721}, // k=43 + {-0.5555702330196020, -0.8314696123025455}, // k=44 + {-0.5956993044924334, -0.8032075314806449}, // k=45 + {-0.6343932841636454, -0.7730104533627371}, // k=46 + {-0.6715589548470184, -0.7409511253549590}, // k=47 + {-0.7071067811865475, -0.7071067811865476}, // k=48 + {-0.7409511253549589, -0.6715589548470186}, // k=49 + {-0.7730104533627370, -0.6343932841636455}, // k=50 + {-0.8032075314806448, -0.5956993044924335}, // k=51 + {-0.8314696123025453, -0.5555702330196022}, // k=52 + {-0.8577286100002720, -0.5141027441932218}, // k=53 + {-0.8819212643483549, -0.4713967368259979}, // k=54 + {-0.9039892931234433, -0.4275550934302820}, // k=55 + {-0.9238795325112867, -0.3826834323650899}, // k=56 + {-0.9415440651830207, -0.3368898533922203}, // k=57 + {-0.9569403357322088, -0.2902846772544624}, // k=58 + {-0.9700312531945440, -0.2429801799032641}, // k=59 + {-0.9807852804032304, -0.1950903220161286}, // k=60 + {-0.9891765099647810, -0.1467304744553618}, // k=61 + {-0.9951847266721968, -0.0980171403295608}, // k=62 + {-0.9987954562051724, -0.0490676743274180}, // k=63 + {-1.0000000000000000, -0.0000000000000001}, // k=64 + {-0.9987954562051724, 0.0490676743274177}, // k=65 + {-0.9951847266721969, 0.0980171403295606}, // k=66 + {-0.9891765099647810, 0.1467304744553616}, // k=67 + {-0.9807852804032304, 0.1950903220161284}, // k=68 + {-0.9700312531945440, 0.2429801799032638}, // k=69 + {-0.9569403357322089, 0.2902846772544621}, // k=70 + {-0.9415440651830208, 0.3368898533922201}, // k=71 + {-0.9238795325112868, 0.3826834323650897}, // k=72 + {-0.9039892931234434, 0.4275550934302818}, // k=73 + {-0.8819212643483550, 0.4713967368259976}, // k=74 + {-0.8577286100002721, 0.5141027441932216}, // k=75 + {-0.8314696123025455, 0.5555702330196020}, // k=76 + {-0.8032075314806449, 0.5956993044924332}, // k=77 + {-0.7730104533627371, 0.6343932841636453}, // k=78 + {-0.7409511253549591, 0.6715589548470184}, // k=79 + {-0.7071067811865477, 0.7071067811865475}, // k=80 + {-0.6715589548470187, 0.7409511253549589}, // k=81 + {-0.6343932841636459, 0.7730104533627367}, // k=82 + {-0.5956993044924331, 0.8032075314806451}, // k=83 + {-0.5555702330196022, 0.8314696123025452}, // k=84 + {-0.5141027441932218, 0.8577286100002720}, // k=85 + {-0.4713967368259979, 0.8819212643483549}, // k=86 + {-0.4275550934302825, 0.9039892931234431}, // k=87 + {-0.3826834323650903, 0.9238795325112865}, // k=88 + {-0.3368898533922199, 0.9415440651830208}, // k=89 + {-0.2902846772544624, 0.9569403357322088}, // k=90 + {-0.2429801799032641, 0.9700312531945440}, // k=91 + {-0.1950903220161287, 0.9807852804032303}, // k=92 + {-0.1467304744553623, 0.9891765099647809}, // k=93 + {-0.0980171403295605, 0.9951847266721969}, // k=94 + {-0.0490676743274180, 0.9987954562051724}, // k=95 + {-0.0000000000000002, 1.0000000000000000}, // k=96 + {0.0490676743274177, 0.9987954562051724}, // k=97 + {0.0980171403295601, 0.9951847266721969}, // k=98 + {0.1467304744553619, 0.9891765099647809}, // k=99 + {0.1950903220161283, 0.9807852804032304}, // k=100 + {0.2429801799032638, 0.9700312531945440}, // k=101 + {0.2902846772544621, 0.9569403357322089}, // k=102 + {0.3368898533922196, 0.9415440651830209}, // k=103 + {0.3826834323650900, 0.9238795325112866}, // k=104 + {0.4275550934302821, 0.9039892931234433}, // k=105 + {0.4713967368259976, 0.8819212643483550}, // k=106 + {0.5141027441932216, 0.8577286100002722}, // k=107 + {0.5555702330196018, 0.8314696123025455}, // k=108 + {0.5956993044924329, 0.8032075314806453}, // k=109 + {0.6343932841636456, 0.7730104533627369}, // k=110 + {0.6715589548470183, 0.7409511253549591}, // k=111 + {0.7071067811865474, 0.7071067811865477}, // k=112 + {0.7409511253549589, 0.6715589548470187}, // k=113 + {0.7730104533627367, 0.6343932841636459}, // k=114 + {0.8032075314806451, 0.5956993044924332}, // k=115 + {0.8314696123025452, 0.5555702330196022}, // k=116 + {0.8577286100002720, 0.5141027441932219}, // k=117 + {0.8819212643483548, 0.4713967368259979}, // k=118 + {0.9039892931234431, 0.4275550934302825}, // k=119 + {0.9238795325112865, 0.3826834323650904}, // k=120 + {0.9415440651830208, 0.3368898533922200}, // k=121 + {0.9569403357322088, 0.2902846772544625}, // k=122 + {0.9700312531945440, 0.2429801799032642}, // k=123 + {0.9807852804032303, 0.1950903220161287}, // k=124 + {0.9891765099647809, 0.1467304744553624}, // k=125 + {0.9951847266721969, 0.0980171403295605}, // k=126 + {0.9987954562051724, 0.0490676743274181}, // k=127 +}; + +__constant__ double2 c_twiddle_256[256] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9996988186962042, -0.0245412285229123}, // k=1 + {0.9987954562051724, -0.0490676743274180}, // k=2 + {0.9972904566786902, -0.0735645635996674}, // k=3 + {0.9951847266721969, -0.0980171403295606}, // k=4 + {0.9924795345987100, -0.1224106751992162}, // k=5 + {0.9891765099647810, -0.1467304744553617}, // k=6 + {0.9852776423889412, -0.1709618887603012}, // k=7 + {0.9807852804032304, -0.1950903220161282}, // k=8 + {0.9757021300385286, -0.2191012401568698}, // k=9 + {0.9700312531945440, -0.2429801799032639}, // k=10 + {0.9637760657954398, -0.2667127574748984}, // k=11 + {0.9569403357322088, -0.2902846772544623}, // k=12 + {0.9495281805930367, -0.3136817403988915}, // k=13 + {0.9415440651830208, -0.3368898533922201}, // k=14 + {0.9329927988347390, -0.3598950365349881}, // k=15 + {0.9238795325112867, -0.3826834323650898}, // k=16 + {0.9142097557035307, -0.4052413140049899}, // k=17 + {0.9039892931234433, -0.4275550934302821}, // k=18 + {0.8932243011955153, -0.4496113296546065}, // k=19 + {0.8819212643483550, -0.4713967368259976}, // k=20 + {0.8700869911087115, -0.4928981922297840}, // k=21 + {0.8577286100002721, -0.5141027441932217}, // k=22 + {0.8448535652497071, -0.5349976198870972}, // k=23 + {0.8314696123025452, -0.5555702330196022}, // k=24 + {0.8175848131515837, -0.5758081914178453}, // k=25 + {0.8032075314806449, -0.5956993044924334}, // k=26 + {0.7883464276266063, -0.6152315905806268}, // k=27 + {0.7730104533627370, -0.6343932841636455}, // k=28 + {0.7572088465064846, -0.6531728429537768}, // k=29 + {0.7409511253549591, -0.6715589548470183}, // k=30 + {0.7242470829514670, -0.6895405447370668}, // k=31 + {0.7071067811865476, -0.7071067811865475}, // k=32 + {0.6895405447370669, -0.7242470829514669}, // k=33 + {0.6715589548470183, -0.7409511253549591}, // k=34 + {0.6531728429537768, -0.7572088465064845}, // k=35 + {0.6343932841636455, -0.7730104533627370}, // k=36 + {0.6152315905806268, -0.7883464276266062}, // k=37 + {0.5956993044924335, -0.8032075314806448}, // k=38 + {0.5758081914178453, -0.8175848131515837}, // k=39 + {0.5555702330196023, -0.8314696123025452}, // k=40 + {0.5349976198870973, -0.8448535652497070}, // k=41 + {0.5141027441932217, -0.8577286100002721}, // k=42 + {0.4928981922297841, -0.8700869911087113}, // k=43 + {0.4713967368259978, -0.8819212643483549}, // k=44 + {0.4496113296546066, -0.8932243011955153}, // k=45 + {0.4275550934302822, -0.9039892931234433}, // k=46 + {0.4052413140049899, -0.9142097557035307}, // k=47 + {0.3826834323650898, -0.9238795325112867}, // k=48 + {0.3598950365349883, -0.9329927988347388}, // k=49 + {0.3368898533922201, -0.9415440651830208}, // k=50 + {0.3136817403988916, -0.9495281805930367}, // k=51 + {0.2902846772544623, -0.9569403357322089}, // k=52 + {0.2667127574748984, -0.9637760657954398}, // k=53 + {0.2429801799032640, -0.9700312531945440}, // k=54 + {0.2191012401568698, -0.9757021300385286}, // k=55 + {0.1950903220161283, -0.9807852804032304}, // k=56 + {0.1709618887603014, -0.9852776423889412}, // k=57 + {0.1467304744553617, -0.9891765099647810}, // k=58 + {0.1224106751992163, -0.9924795345987100}, // k=59 + {0.0980171403295608, -0.9951847266721968}, // k=60 + {0.0735645635996675, -0.9972904566786902}, // k=61 + {0.0490676743274181, -0.9987954562051724}, // k=62 + {0.0245412285229123, -0.9996988186962042}, // k=63 + {0.0000000000000001, -1.0000000000000000}, // k=64 + {-0.0245412285229121, -0.9996988186962042}, // k=65 + {-0.0490676743274180, -0.9987954562051724}, // k=66 + {-0.0735645635996673, -0.9972904566786902}, // k=67 + {-0.0980171403295606, -0.9951847266721969}, // k=68 + {-0.1224106751992162, -0.9924795345987100}, // k=69 + {-0.1467304744553616, -0.9891765099647810}, // k=70 + {-0.1709618887603012, -0.9852776423889412}, // k=71 + {-0.1950903220161282, -0.9807852804032304}, // k=72 + {-0.2191012401568697, -0.9757021300385286}, // k=73 + {-0.2429801799032639, -0.9700312531945440}, // k=74 + {-0.2667127574748983, -0.9637760657954398}, // k=75 + {-0.2902846772544622, -0.9569403357322089}, // k=76 + {-0.3136817403988914, -0.9495281805930367}, // k=77 + {-0.3368898533922199, -0.9415440651830208}, // k=78 + {-0.3598950365349882, -0.9329927988347388}, // k=79 + {-0.3826834323650897, -0.9238795325112867}, // k=80 + {-0.4052413140049897, -0.9142097557035307}, // k=81 + {-0.4275550934302819, -0.9039892931234434}, // k=82 + {-0.4496113296546067, -0.8932243011955152}, // k=83 + {-0.4713967368259977, -0.8819212643483550}, // k=84 + {-0.4928981922297840, -0.8700869911087115}, // k=85 + {-0.5141027441932217, -0.8577286100002721}, // k=86 + {-0.5349976198870970, -0.8448535652497072}, // k=87 + {-0.5555702330196020, -0.8314696123025455}, // k=88 + {-0.5758081914178453, -0.8175848131515837}, // k=89 + {-0.5956993044924334, -0.8032075314806449}, // k=90 + {-0.6152315905806267, -0.7883464276266063}, // k=91 + {-0.6343932841636454, -0.7730104533627371}, // k=92 + {-0.6531728429537765, -0.7572088465064847}, // k=93 + {-0.6715589548470184, -0.7409511253549590}, // k=94 + {-0.6895405447370669, -0.7242470829514669}, // k=95 + {-0.7071067811865475, -0.7071067811865476}, // k=96 + {-0.7242470829514668, -0.6895405447370671}, // k=97 + {-0.7409511253549589, -0.6715589548470186}, // k=98 + {-0.7572088465064846, -0.6531728429537766}, // k=99 + {-0.7730104533627370, -0.6343932841636455}, // k=100 + {-0.7883464276266062, -0.6152315905806269}, // k=101 + {-0.8032075314806448, -0.5956993044924335}, // k=102 + {-0.8175848131515836, -0.5758081914178454}, // k=103 + {-0.8314696123025453, -0.5555702330196022}, // k=104 + {-0.8448535652497071, -0.5349976198870972}, // k=105 + {-0.8577286100002720, -0.5141027441932218}, // k=106 + {-0.8700869911087113, -0.4928981922297841}, // k=107 + {-0.8819212643483549, -0.4713967368259979}, // k=108 + {-0.8932243011955152, -0.4496113296546069}, // k=109 + {-0.9039892931234433, -0.4275550934302820}, // k=110 + {-0.9142097557035307, -0.4052413140049899}, // k=111 + {-0.9238795325112867, -0.3826834323650899}, // k=112 + {-0.9329927988347388, -0.3598950365349883}, // k=113 + {-0.9415440651830207, -0.3368898533922203}, // k=114 + {-0.9495281805930367, -0.3136817403988914}, // k=115 + {-0.9569403357322088, -0.2902846772544624}, // k=116 + {-0.9637760657954398, -0.2667127574748985}, // k=117 + {-0.9700312531945440, -0.2429801799032641}, // k=118 + {-0.9757021300385285, -0.2191012401568700}, // k=119 + {-0.9807852804032304, -0.1950903220161286}, // k=120 + {-0.9852776423889412, -0.1709618887603012}, // k=121 + {-0.9891765099647810, -0.1467304744553618}, // k=122 + {-0.9924795345987100, -0.1224106751992163}, // k=123 + {-0.9951847266721968, -0.0980171403295608}, // k=124 + {-0.9972904566786902, -0.0735645635996677}, // k=125 + {-0.9987954562051724, -0.0490676743274180}, // k=126 + {-0.9996988186962042, -0.0245412285229123}, // k=127 + {-1.0000000000000000, -0.0000000000000001}, // k=128 + {-0.9996988186962042, 0.0245412285229121}, // k=129 + {-0.9987954562051724, 0.0490676743274177}, // k=130 + {-0.9972904566786902, 0.0735645635996675}, // k=131 + {-0.9951847266721969, 0.0980171403295606}, // k=132 + {-0.9924795345987100, 0.1224106751992161}, // k=133 + {-0.9891765099647810, 0.1467304744553616}, // k=134 + {-0.9852776423889413, 0.1709618887603010}, // k=135 + {-0.9807852804032304, 0.1950903220161284}, // k=136 + {-0.9757021300385286, 0.2191012401568698}, // k=137 + {-0.9700312531945440, 0.2429801799032638}, // k=138 + {-0.9637760657954400, 0.2667127574748983}, // k=139 + {-0.9569403357322089, 0.2902846772544621}, // k=140 + {-0.9495281805930368, 0.3136817403988912}, // k=141 + {-0.9415440651830208, 0.3368898533922201}, // k=142 + {-0.9329927988347390, 0.3598950365349881}, // k=143 + {-0.9238795325112868, 0.3826834323650897}, // k=144 + {-0.9142097557035307, 0.4052413140049897}, // k=145 + {-0.9039892931234434, 0.4275550934302818}, // k=146 + {-0.8932243011955153, 0.4496113296546067}, // k=147 + {-0.8819212643483550, 0.4713967368259976}, // k=148 + {-0.8700869911087115, 0.4928981922297839}, // k=149 + {-0.8577286100002721, 0.5141027441932216}, // k=150 + {-0.8448535652497072, 0.5349976198870969}, // k=151 + {-0.8314696123025455, 0.5555702330196020}, // k=152 + {-0.8175848131515837, 0.5758081914178453}, // k=153 + {-0.8032075314806449, 0.5956993044924332}, // k=154 + {-0.7883464276266063, 0.6152315905806267}, // k=155 + {-0.7730104533627371, 0.6343932841636453}, // k=156 + {-0.7572088465064848, 0.6531728429537765}, // k=157 + {-0.7409511253549591, 0.6715589548470184}, // k=158 + {-0.7242470829514670, 0.6895405447370668}, // k=159 + {-0.7071067811865477, 0.7071067811865475}, // k=160 + {-0.6895405447370671, 0.7242470829514668}, // k=161 + {-0.6715589548470187, 0.7409511253549589}, // k=162 + {-0.6531728429537771, 0.7572088465064842}, // k=163 + {-0.6343932841636459, 0.7730104533627367}, // k=164 + {-0.6152315905806273, 0.7883464276266059}, // k=165 + {-0.5956993044924331, 0.8032075314806451}, // k=166 + {-0.5758081914178452, 0.8175848131515838}, // k=167 + {-0.5555702330196022, 0.8314696123025452}, // k=168 + {-0.5349976198870973, 0.8448535652497070}, // k=169 + {-0.5141027441932218, 0.8577286100002720}, // k=170 + {-0.4928981922297842, 0.8700869911087113}, // k=171 + {-0.4713967368259979, 0.8819212643483549}, // k=172 + {-0.4496113296546069, 0.8932243011955152}, // k=173 + {-0.4275550934302825, 0.9039892931234431}, // k=174 + {-0.4052413140049904, 0.9142097557035305}, // k=175 + {-0.3826834323650903, 0.9238795325112865}, // k=176 + {-0.3598950365349879, 0.9329927988347390}, // k=177 + {-0.3368898533922199, 0.9415440651830208}, // k=178 + {-0.3136817403988915, 0.9495281805930367}, // k=179 + {-0.2902846772544624, 0.9569403357322088}, // k=180 + {-0.2667127574748985, 0.9637760657954398}, // k=181 + {-0.2429801799032641, 0.9700312531945440}, // k=182 + {-0.2191012401568701, 0.9757021300385285}, // k=183 + {-0.1950903220161287, 0.9807852804032303}, // k=184 + {-0.1709618887603017, 0.9852776423889411}, // k=185 + {-0.1467304744553623, 0.9891765099647809}, // k=186 + {-0.1224106751992160, 0.9924795345987101}, // k=187 + {-0.0980171403295605, 0.9951847266721969}, // k=188 + {-0.0735645635996674, 0.9972904566786902}, // k=189 + {-0.0490676743274180, 0.9987954562051724}, // k=190 + {-0.0245412285229124, 0.9996988186962042}, // k=191 + {-0.0000000000000002, 1.0000000000000000}, // k=192 + {0.0245412285229120, 0.9996988186962042}, // k=193 + {0.0490676743274177, 0.9987954562051724}, // k=194 + {0.0735645635996670, 0.9972904566786902}, // k=195 + {0.0980171403295601, 0.9951847266721969}, // k=196 + {0.1224106751992156, 0.9924795345987101}, // k=197 + {0.1467304744553619, 0.9891765099647809}, // k=198 + {0.1709618887603013, 0.9852776423889412}, // k=199 + {0.1950903220161283, 0.9807852804032304}, // k=200 + {0.2191012401568697, 0.9757021300385286}, // k=201 + {0.2429801799032638, 0.9700312531945440}, // k=202 + {0.2667127574748982, 0.9637760657954400}, // k=203 + {0.2902846772544621, 0.9569403357322089}, // k=204 + {0.3136817403988911, 0.9495281805930368}, // k=205 + {0.3368898533922196, 0.9415440651830209}, // k=206 + {0.3598950365349876, 0.9329927988347391}, // k=207 + {0.3826834323650900, 0.9238795325112866}, // k=208 + {0.4052413140049900, 0.9142097557035306}, // k=209 + {0.4275550934302821, 0.9039892931234433}, // k=210 + {0.4496113296546066, 0.8932243011955153}, // k=211 + {0.4713967368259976, 0.8819212643483550}, // k=212 + {0.4928981922297839, 0.8700869911087115}, // k=213 + {0.5141027441932216, 0.8577286100002722}, // k=214 + {0.5349976198870969, 0.8448535652497072}, // k=215 + {0.5555702330196018, 0.8314696123025455}, // k=216 + {0.5758081914178449, 0.8175848131515840}, // k=217 + {0.5956993044924329, 0.8032075314806453}, // k=218 + {0.6152315905806270, 0.7883464276266061}, // k=219 + {0.6343932841636456, 0.7730104533627369}, // k=220 + {0.6531728429537768, 0.7572088465064846}, // k=221 + {0.6715589548470183, 0.7409511253549591}, // k=222 + {0.6895405447370668, 0.7242470829514670}, // k=223 + {0.7071067811865474, 0.7071067811865477}, // k=224 + {0.7242470829514667, 0.6895405447370672}, // k=225 + {0.7409511253549589, 0.6715589548470187}, // k=226 + {0.7572088465064842, 0.6531728429537771}, // k=227 + {0.7730104533627367, 0.6343932841636459}, // k=228 + {0.7883464276266059, 0.6152315905806274}, // k=229 + {0.8032075314806451, 0.5956993044924332}, // k=230 + {0.8175848131515837, 0.5758081914178452}, // k=231 + {0.8314696123025452, 0.5555702330196022}, // k=232 + {0.8448535652497070, 0.5349976198870973}, // k=233 + {0.8577286100002720, 0.5141027441932219}, // k=234 + {0.8700869911087113, 0.4928981922297843}, // k=235 + {0.8819212643483548, 0.4713967368259979}, // k=236 + {0.8932243011955151, 0.4496113296546070}, // k=237 + {0.9039892931234431, 0.4275550934302825}, // k=238 + {0.9142097557035305, 0.4052413140049904}, // k=239 + {0.9238795325112865, 0.3826834323650904}, // k=240 + {0.9329927988347390, 0.3598950365349880}, // k=241 + {0.9415440651830208, 0.3368898533922200}, // k=242 + {0.9495281805930367, 0.3136817403988915}, // k=243 + {0.9569403357322088, 0.2902846772544625}, // k=244 + {0.9637760657954398, 0.2667127574748986}, // k=245 + {0.9700312531945440, 0.2429801799032642}, // k=246 + {0.9757021300385285, 0.2191012401568702}, // k=247 + {0.9807852804032303, 0.1950903220161287}, // k=248 + {0.9852776423889411, 0.1709618887603018}, // k=249 + {0.9891765099647809, 0.1467304744553624}, // k=250 + {0.9924795345987100, 0.1224106751992160}, // k=251 + {0.9951847266721969, 0.0980171403295605}, // k=252 + {0.9972904566786902, 0.0735645635996674}, // k=253 + {0.9987954562051724, 0.0490676743274181}, // k=254 + {0.9996988186962042, 0.0245412285229124}, // k=255 +}; + +__constant__ double2 c_twiddle_512[512] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9999247018391445, -0.0122715382857199}, // k=1 + {0.9996988186962042, -0.0245412285229123}, // k=2 + {0.9993223845883495, -0.0368072229413588}, // k=3 + {0.9987954562051724, -0.0490676743274180}, // k=4 + {0.9981181129001492, -0.0613207363022086}, // k=5 + {0.9972904566786902, -0.0735645635996674}, // k=6 + {0.9963126121827780, -0.0857973123444399}, // k=7 + {0.9951847266721969, -0.0980171403295606}, // k=8 + {0.9939069700023561, -0.1102222072938831}, // k=9 + {0.9924795345987100, -0.1224106751992162}, // k=10 + {0.9909026354277800, -0.1345807085071262}, // k=11 + {0.9891765099647810, -0.1467304744553617}, // k=12 + {0.9873014181578584, -0.1588581433338614}, // k=13 + {0.9852776423889412, -0.1709618887603012}, // k=14 + {0.9831054874312163, -0.1830398879551410}, // k=15 + {0.9807852804032304, -0.1950903220161282}, // k=16 + {0.9783173707196277, -0.2071113761922186}, // k=17 + {0.9757021300385286, -0.2191012401568698}, // k=18 + {0.9729399522055602, -0.2310581082806711}, // k=19 + {0.9700312531945440, -0.2429801799032639}, // k=20 + {0.9669764710448521, -0.2548656596045146}, // k=21 + {0.9637760657954398, -0.2667127574748984}, // k=22 + {0.9604305194155658, -0.2785196893850531}, // k=23 + {0.9569403357322088, -0.2902846772544623}, // k=24 + {0.9533060403541939, -0.3020059493192281}, // k=25 + {0.9495281805930367, -0.3136817403988915}, // k=26 + {0.9456073253805213, -0.3253102921622629}, // k=27 + {0.9415440651830208, -0.3368898533922201}, // k=28 + {0.9373390119125750, -0.3484186802494346}, // k=29 + {0.9329927988347390, -0.3598950365349881}, // k=30 + {0.9285060804732156, -0.3713171939518375}, // k=31 + {0.9238795325112867, -0.3826834323650898}, // k=32 + {0.9191138516900578, -0.3939920400610481}, // k=33 + {0.9142097557035307, -0.4052413140049899}, // k=34 + {0.9091679830905224, -0.4164295600976372}, // k=35 + {0.9039892931234433, -0.4275550934302821}, // k=36 + {0.8986744656939538, -0.4386162385385277}, // k=37 + {0.8932243011955153, -0.4496113296546065}, // k=38 + {0.8876396204028539, -0.4605387109582400}, // k=39 + {0.8819212643483550, -0.4713967368259976}, // k=40 + {0.8760700941954066, -0.4821837720791227}, // k=41 + {0.8700869911087115, -0.4928981922297840}, // k=42 + {0.8639728561215868, -0.5035383837257176}, // k=43 + {0.8577286100002721, -0.5141027441932217}, // k=44 + {0.8513551931052652, -0.5245896826784689}, // k=45 + {0.8448535652497071, -0.5349976198870972}, // k=46 + {0.8382247055548381, -0.5453249884220465}, // k=47 + {0.8314696123025452, -0.5555702330196022}, // k=48 + {0.8245893027850253, -0.5657318107836131}, // k=49 + {0.8175848131515837, -0.5758081914178453}, // k=50 + {0.8104571982525948, -0.5857978574564389}, // k=51 + {0.8032075314806449, -0.5956993044924334}, // k=52 + {0.7958369046088836, -0.6055110414043255}, // k=53 + {0.7883464276266063, -0.6152315905806268}, // k=54 + {0.7807372285720945, -0.6248594881423863}, // k=55 + {0.7730104533627370, -0.6343932841636455}, // k=56 + {0.7651672656224590, -0.6438315428897914}, // k=57 + {0.7572088465064846, -0.6531728429537768}, // k=58 + {0.7491363945234594, -0.6624157775901718}, // k=59 + {0.7409511253549591, -0.6715589548470183}, // k=60 + {0.7326542716724128, -0.6806009977954530}, // k=61 + {0.7242470829514670, -0.6895405447370668}, // k=62 + {0.7157308252838186, -0.6983762494089729}, // k=63 + {0.7071067811865476, -0.7071067811865475}, // k=64 + {0.6983762494089729, -0.7157308252838186}, // k=65 + {0.6895405447370669, -0.7242470829514669}, // k=66 + {0.6806009977954531, -0.7326542716724128}, // k=67 + {0.6715589548470183, -0.7409511253549591}, // k=68 + {0.6624157775901718, -0.7491363945234593}, // k=69 + {0.6531728429537768, -0.7572088465064845}, // k=70 + {0.6438315428897915, -0.7651672656224590}, // k=71 + {0.6343932841636455, -0.7730104533627370}, // k=72 + {0.6248594881423865, -0.7807372285720944}, // k=73 + {0.6152315905806268, -0.7883464276266062}, // k=74 + {0.6055110414043255, -0.7958369046088835}, // k=75 + {0.5956993044924335, -0.8032075314806448}, // k=76 + {0.5857978574564389, -0.8104571982525948}, // k=77 + {0.5758081914178453, -0.8175848131515837}, // k=78 + {0.5657318107836132, -0.8245893027850253}, // k=79 + {0.5555702330196023, -0.8314696123025452}, // k=80 + {0.5453249884220465, -0.8382247055548380}, // k=81 + {0.5349976198870973, -0.8448535652497070}, // k=82 + {0.5245896826784688, -0.8513551931052652}, // k=83 + {0.5141027441932217, -0.8577286100002721}, // k=84 + {0.5035383837257176, -0.8639728561215867}, // k=85 + {0.4928981922297841, -0.8700869911087113}, // k=86 + {0.4821837720791228, -0.8760700941954066}, // k=87 + {0.4713967368259978, -0.8819212643483549}, // k=88 + {0.4605387109582400, -0.8876396204028539}, // k=89 + {0.4496113296546066, -0.8932243011955153}, // k=90 + {0.4386162385385277, -0.8986744656939538}, // k=91 + {0.4275550934302822, -0.9039892931234433}, // k=92 + {0.4164295600976373, -0.9091679830905223}, // k=93 + {0.4052413140049899, -0.9142097557035307}, // k=94 + {0.3939920400610481, -0.9191138516900578}, // k=95 + {0.3826834323650898, -0.9238795325112867}, // k=96 + {0.3713171939518376, -0.9285060804732155}, // k=97 + {0.3598950365349883, -0.9329927988347388}, // k=98 + {0.3484186802494345, -0.9373390119125750}, // k=99 + {0.3368898533922201, -0.9415440651830208}, // k=100 + {0.3253102921622630, -0.9456073253805213}, // k=101 + {0.3136817403988916, -0.9495281805930367}, // k=102 + {0.3020059493192282, -0.9533060403541938}, // k=103 + {0.2902846772544623, -0.9569403357322089}, // k=104 + {0.2785196893850531, -0.9604305194155658}, // k=105 + {0.2667127574748984, -0.9637760657954398}, // k=106 + {0.2548656596045146, -0.9669764710448521}, // k=107 + {0.2429801799032640, -0.9700312531945440}, // k=108 + {0.2310581082806713, -0.9729399522055601}, // k=109 + {0.2191012401568698, -0.9757021300385286}, // k=110 + {0.2071113761922186, -0.9783173707196277}, // k=111 + {0.1950903220161283, -0.9807852804032304}, // k=112 + {0.1830398879551411, -0.9831054874312163}, // k=113 + {0.1709618887603014, -0.9852776423889412}, // k=114 + {0.1588581433338614, -0.9873014181578584}, // k=115 + {0.1467304744553617, -0.9891765099647810}, // k=116 + {0.1345807085071262, -0.9909026354277800}, // k=117 + {0.1224106751992163, -0.9924795345987100}, // k=118 + {0.1102222072938832, -0.9939069700023561}, // k=119 + {0.0980171403295608, -0.9951847266721968}, // k=120 + {0.0857973123444399, -0.9963126121827780}, // k=121 + {0.0735645635996675, -0.9972904566786902}, // k=122 + {0.0613207363022086, -0.9981181129001492}, // k=123 + {0.0490676743274181, -0.9987954562051724}, // k=124 + {0.0368072229413590, -0.9993223845883495}, // k=125 + {0.0245412285229123, -0.9996988186962042}, // k=126 + {0.0122715382857199, -0.9999247018391445}, // k=127 + {0.0000000000000001, -1.0000000000000000}, // k=128 + {-0.0122715382857198, -0.9999247018391445}, // k=129 + {-0.0245412285229121, -0.9996988186962042}, // k=130 + {-0.0368072229413589, -0.9993223845883495}, // k=131 + {-0.0490676743274180, -0.9987954562051724}, // k=132 + {-0.0613207363022085, -0.9981181129001492}, // k=133 + {-0.0735645635996673, -0.9972904566786902}, // k=134 + {-0.0857973123444398, -0.9963126121827780}, // k=135 + {-0.0980171403295606, -0.9951847266721969}, // k=136 + {-0.1102222072938831, -0.9939069700023561}, // k=137 + {-0.1224106751992162, -0.9924795345987100}, // k=138 + {-0.1345807085071261, -0.9909026354277800}, // k=139 + {-0.1467304744553616, -0.9891765099647810}, // k=140 + {-0.1588581433338613, -0.9873014181578584}, // k=141 + {-0.1709618887603012, -0.9852776423889412}, // k=142 + {-0.1830398879551409, -0.9831054874312163}, // k=143 + {-0.1950903220161282, -0.9807852804032304}, // k=144 + {-0.2071113761922184, -0.9783173707196277}, // k=145 + {-0.2191012401568697, -0.9757021300385286}, // k=146 + {-0.2310581082806711, -0.9729399522055602}, // k=147 + {-0.2429801799032639, -0.9700312531945440}, // k=148 + {-0.2548656596045145, -0.9669764710448521}, // k=149 + {-0.2667127574748983, -0.9637760657954398}, // k=150 + {-0.2785196893850529, -0.9604305194155659}, // k=151 + {-0.2902846772544622, -0.9569403357322089}, // k=152 + {-0.3020059493192281, -0.9533060403541939}, // k=153 + {-0.3136817403988914, -0.9495281805930367}, // k=154 + {-0.3253102921622629, -0.9456073253805214}, // k=155 + {-0.3368898533922199, -0.9415440651830208}, // k=156 + {-0.3484186802494344, -0.9373390119125750}, // k=157 + {-0.3598950365349882, -0.9329927988347388}, // k=158 + {-0.3713171939518375, -0.9285060804732156}, // k=159 + {-0.3826834323650897, -0.9238795325112867}, // k=160 + {-0.3939920400610480, -0.9191138516900578}, // k=161 + {-0.4052413140049897, -0.9142097557035307}, // k=162 + {-0.4164295600976370, -0.9091679830905225}, // k=163 + {-0.4275550934302819, -0.9039892931234434}, // k=164 + {-0.4386162385385274, -0.8986744656939539}, // k=165 + {-0.4496113296546067, -0.8932243011955152}, // k=166 + {-0.4605387109582401, -0.8876396204028539}, // k=167 + {-0.4713967368259977, -0.8819212643483550}, // k=168 + {-0.4821837720791227, -0.8760700941954066}, // k=169 + {-0.4928981922297840, -0.8700869911087115}, // k=170 + {-0.5035383837257175, -0.8639728561215868}, // k=171 + {-0.5141027441932217, -0.8577286100002721}, // k=172 + {-0.5245896826784687, -0.8513551931052652}, // k=173 + {-0.5349976198870970, -0.8448535652497072}, // k=174 + {-0.5453249884220462, -0.8382247055548382}, // k=175 + {-0.5555702330196020, -0.8314696123025455}, // k=176 + {-0.5657318107836132, -0.8245893027850252}, // k=177 + {-0.5758081914178453, -0.8175848131515837}, // k=178 + {-0.5857978574564389, -0.8104571982525948}, // k=179 + {-0.5956993044924334, -0.8032075314806449}, // k=180 + {-0.6055110414043254, -0.7958369046088836}, // k=181 + {-0.6152315905806267, -0.7883464276266063}, // k=182 + {-0.6248594881423862, -0.7807372285720946}, // k=183 + {-0.6343932841636454, -0.7730104533627371}, // k=184 + {-0.6438315428897913, -0.7651672656224591}, // k=185 + {-0.6531728429537765, -0.7572088465064847}, // k=186 + {-0.6624157775901719, -0.7491363945234593}, // k=187 + {-0.6715589548470184, -0.7409511253549590}, // k=188 + {-0.6806009977954530, -0.7326542716724128}, // k=189 + {-0.6895405447370669, -0.7242470829514669}, // k=190 + {-0.6983762494089728, -0.7157308252838187}, // k=191 + {-0.7071067811865475, -0.7071067811865476}, // k=192 + {-0.7157308252838186, -0.6983762494089729}, // k=193 + {-0.7242470829514668, -0.6895405447370671}, // k=194 + {-0.7326542716724127, -0.6806009977954532}, // k=195 + {-0.7409511253549589, -0.6715589548470186}, // k=196 + {-0.7491363945234591, -0.6624157775901720}, // k=197 + {-0.7572088465064846, -0.6531728429537766}, // k=198 + {-0.7651672656224590, -0.6438315428897914}, // k=199 + {-0.7730104533627370, -0.6343932841636455}, // k=200 + {-0.7807372285720945, -0.6248594881423863}, // k=201 + {-0.7883464276266062, -0.6152315905806269}, // k=202 + {-0.7958369046088835, -0.6055110414043257}, // k=203 + {-0.8032075314806448, -0.5956993044924335}, // k=204 + {-0.8104571982525947, -0.5857978574564390}, // k=205 + {-0.8175848131515836, -0.5758081914178454}, // k=206 + {-0.8245893027850251, -0.5657318107836135}, // k=207 + {-0.8314696123025453, -0.5555702330196022}, // k=208 + {-0.8382247055548381, -0.5453249884220464}, // k=209 + {-0.8448535652497071, -0.5349976198870972}, // k=210 + {-0.8513551931052652, -0.5245896826784689}, // k=211 + {-0.8577286100002720, -0.5141027441932218}, // k=212 + {-0.8639728561215867, -0.5035383837257177}, // k=213 + {-0.8700869911087113, -0.4928981922297841}, // k=214 + {-0.8760700941954065, -0.4821837720791229}, // k=215 + {-0.8819212643483549, -0.4713967368259979}, // k=216 + {-0.8876396204028538, -0.4605387109582402}, // k=217 + {-0.8932243011955152, -0.4496113296546069}, // k=218 + {-0.8986744656939539, -0.4386162385385275}, // k=219 + {-0.9039892931234433, -0.4275550934302820}, // k=220 + {-0.9091679830905224, -0.4164295600976372}, // k=221 + {-0.9142097557035307, -0.4052413140049899}, // k=222 + {-0.9191138516900578, -0.3939920400610482}, // k=223 + {-0.9238795325112867, -0.3826834323650899}, // k=224 + {-0.9285060804732155, -0.3713171939518377}, // k=225 + {-0.9329927988347388, -0.3598950365349883}, // k=226 + {-0.9373390119125748, -0.3484186802494348}, // k=227 + {-0.9415440651830207, -0.3368898533922203}, // k=228 + {-0.9456073253805212, -0.3253102921622633}, // k=229 + {-0.9495281805930367, -0.3136817403988914}, // k=230 + {-0.9533060403541939, -0.3020059493192280}, // k=231 + {-0.9569403357322088, -0.2902846772544624}, // k=232 + {-0.9604305194155658, -0.2785196893850532}, // k=233 + {-0.9637760657954398, -0.2667127574748985}, // k=234 + {-0.9669764710448521, -0.2548656596045147}, // k=235 + {-0.9700312531945440, -0.2429801799032641}, // k=236 + {-0.9729399522055601, -0.2310581082806713}, // k=237 + {-0.9757021300385285, -0.2191012401568700}, // k=238 + {-0.9783173707196275, -0.2071113761922188}, // k=239 + {-0.9807852804032304, -0.1950903220161286}, // k=240 + {-0.9831054874312163, -0.1830398879551409}, // k=241 + {-0.9852776423889412, -0.1709618887603012}, // k=242 + {-0.9873014181578584, -0.1588581433338615}, // k=243 + {-0.9891765099647810, -0.1467304744553618}, // k=244 + {-0.9909026354277800, -0.1345807085071263}, // k=245 + {-0.9924795345987100, -0.1224106751992163}, // k=246 + {-0.9939069700023561, -0.1102222072938832}, // k=247 + {-0.9951847266721968, -0.0980171403295608}, // k=248 + {-0.9963126121827780, -0.0857973123444402}, // k=249 + {-0.9972904566786902, -0.0735645635996677}, // k=250 + {-0.9981181129001492, -0.0613207363022085}, // k=251 + {-0.9987954562051724, -0.0490676743274180}, // k=252 + {-0.9993223845883495, -0.0368072229413588}, // k=253 + {-0.9996988186962042, -0.0245412285229123}, // k=254 + {-0.9999247018391445, -0.0122715382857200}, // k=255 + {-1.0000000000000000, -0.0000000000000001}, // k=256 + {-0.9999247018391445, 0.0122715382857198}, // k=257 + {-0.9996988186962042, 0.0245412285229121}, // k=258 + {-0.9993223845883495, 0.0368072229413586}, // k=259 + {-0.9987954562051724, 0.0490676743274177}, // k=260 + {-0.9981181129001492, 0.0613207363022082}, // k=261 + {-0.9972904566786902, 0.0735645635996675}, // k=262 + {-0.9963126121827780, 0.0857973123444399}, // k=263 + {-0.9951847266721969, 0.0980171403295606}, // k=264 + {-0.9939069700023561, 0.1102222072938830}, // k=265 + {-0.9924795345987100, 0.1224106751992161}, // k=266 + {-0.9909026354277800, 0.1345807085071261}, // k=267 + {-0.9891765099647810, 0.1467304744553616}, // k=268 + {-0.9873014181578584, 0.1588581433338612}, // k=269 + {-0.9852776423889413, 0.1709618887603010}, // k=270 + {-0.9831054874312164, 0.1830398879551406}, // k=271 + {-0.9807852804032304, 0.1950903220161284}, // k=272 + {-0.9783173707196277, 0.2071113761922186}, // k=273 + {-0.9757021300385286, 0.2191012401568698}, // k=274 + {-0.9729399522055602, 0.2310581082806711}, // k=275 + {-0.9700312531945440, 0.2429801799032638}, // k=276 + {-0.9669764710448522, 0.2548656596045145}, // k=277 + {-0.9637760657954400, 0.2667127574748983}, // k=278 + {-0.9604305194155659, 0.2785196893850529}, // k=279 + {-0.9569403357322089, 0.2902846772544621}, // k=280 + {-0.9533060403541940, 0.3020059493192278}, // k=281 + {-0.9495281805930368, 0.3136817403988912}, // k=282 + {-0.9456073253805213, 0.3253102921622630}, // k=283 + {-0.9415440651830208, 0.3368898533922201}, // k=284 + {-0.9373390119125750, 0.3484186802494346}, // k=285 + {-0.9329927988347390, 0.3598950365349881}, // k=286 + {-0.9285060804732156, 0.3713171939518374}, // k=287 + {-0.9238795325112868, 0.3826834323650897}, // k=288 + {-0.9191138516900578, 0.3939920400610479}, // k=289 + {-0.9142097557035307, 0.4052413140049897}, // k=290 + {-0.9091679830905225, 0.4164295600976369}, // k=291 + {-0.9039892931234434, 0.4275550934302818}, // k=292 + {-0.8986744656939540, 0.4386162385385273}, // k=293 + {-0.8932243011955153, 0.4496113296546067}, // k=294 + {-0.8876396204028539, 0.4605387109582401}, // k=295 + {-0.8819212643483550, 0.4713967368259976}, // k=296 + {-0.8760700941954066, 0.4821837720791227}, // k=297 + {-0.8700869911087115, 0.4928981922297839}, // k=298 + {-0.8639728561215868, 0.5035383837257175}, // k=299 + {-0.8577286100002721, 0.5141027441932216}, // k=300 + {-0.8513551931052653, 0.5245896826784687}, // k=301 + {-0.8448535652497072, 0.5349976198870969}, // k=302 + {-0.8382247055548382, 0.5453249884220461}, // k=303 + {-0.8314696123025455, 0.5555702330196020}, // k=304 + {-0.8245893027850253, 0.5657318107836132}, // k=305 + {-0.8175848131515837, 0.5758081914178453}, // k=306 + {-0.8104571982525948, 0.5857978574564389}, // k=307 + {-0.8032075314806449, 0.5956993044924332}, // k=308 + {-0.7958369046088836, 0.6055110414043254}, // k=309 + {-0.7883464276266063, 0.6152315905806267}, // k=310 + {-0.7807372285720946, 0.6248594881423862}, // k=311 + {-0.7730104533627371, 0.6343932841636453}, // k=312 + {-0.7651672656224591, 0.6438315428897913}, // k=313 + {-0.7572088465064848, 0.6531728429537765}, // k=314 + {-0.7491363945234593, 0.6624157775901718}, // k=315 + {-0.7409511253549591, 0.6715589548470184}, // k=316 + {-0.7326542716724128, 0.6806009977954530}, // k=317 + {-0.7242470829514670, 0.6895405447370668}, // k=318 + {-0.7157308252838187, 0.6983762494089728}, // k=319 + {-0.7071067811865477, 0.7071067811865475}, // k=320 + {-0.6983762494089730, 0.7157308252838185}, // k=321 + {-0.6895405447370671, 0.7242470829514668}, // k=322 + {-0.6806009977954532, 0.7326542716724126}, // k=323 + {-0.6715589548470187, 0.7409511253549589}, // k=324 + {-0.6624157775901720, 0.7491363945234590}, // k=325 + {-0.6531728429537771, 0.7572088465064842}, // k=326 + {-0.6438315428897915, 0.7651672656224590}, // k=327 + {-0.6343932841636459, 0.7730104533627367}, // k=328 + {-0.6248594881423865, 0.7807372285720944}, // k=329 + {-0.6152315905806273, 0.7883464276266059}, // k=330 + {-0.6055110414043257, 0.7958369046088835}, // k=331 + {-0.5956993044924331, 0.8032075314806451}, // k=332 + {-0.5857978574564391, 0.8104571982525947}, // k=333 + {-0.5758081914178452, 0.8175848131515838}, // k=334 + {-0.5657318107836135, 0.8245893027850251}, // k=335 + {-0.5555702330196022, 0.8314696123025452}, // k=336 + {-0.5453249884220468, 0.8382247055548379}, // k=337 + {-0.5349976198870973, 0.8448535652497070}, // k=338 + {-0.5245896826784694, 0.8513551931052649}, // k=339 + {-0.5141027441932218, 0.8577286100002720}, // k=340 + {-0.5035383837257180, 0.8639728561215865}, // k=341 + {-0.4928981922297842, 0.8700869911087113}, // k=342 + {-0.4821837720791226, 0.8760700941954067}, // k=343 + {-0.4713967368259979, 0.8819212643483549}, // k=344 + {-0.4605387109582399, 0.8876396204028540}, // k=345 + {-0.4496113296546069, 0.8932243011955152}, // k=346 + {-0.4386162385385276, 0.8986744656939538}, // k=347 + {-0.4275550934302825, 0.9039892931234431}, // k=348 + {-0.4164295600976372, 0.9091679830905224}, // k=349 + {-0.4052413140049904, 0.9142097557035305}, // k=350 + {-0.3939920400610482, 0.9191138516900577}, // k=351 + {-0.3826834323650903, 0.9238795325112865}, // k=352 + {-0.3713171939518378, 0.9285060804732155}, // k=353 + {-0.3598950365349879, 0.9329927988347390}, // k=354 + {-0.3484186802494348, 0.9373390119125748}, // k=355 + {-0.3368898533922199, 0.9415440651830208}, // k=356 + {-0.3253102921622633, 0.9456073253805212}, // k=357 + {-0.3136817403988915, 0.9495281805930367}, // k=358 + {-0.3020059493192285, 0.9533060403541938}, // k=359 + {-0.2902846772544624, 0.9569403357322088}, // k=360 + {-0.2785196893850536, 0.9604305194155657}, // k=361 + {-0.2667127574748985, 0.9637760657954398}, // k=362 + {-0.2548656596045143, 0.9669764710448522}, // k=363 + {-0.2429801799032641, 0.9700312531945440}, // k=364 + {-0.2310581082806709, 0.9729399522055602}, // k=365 + {-0.2191012401568701, 0.9757021300385285}, // k=366 + {-0.2071113761922185, 0.9783173707196277}, // k=367 + {-0.1950903220161287, 0.9807852804032303}, // k=368 + {-0.1830398879551410, 0.9831054874312163}, // k=369 + {-0.1709618887603017, 0.9852776423889411}, // k=370 + {-0.1588581433338615, 0.9873014181578583}, // k=371 + {-0.1467304744553623, 0.9891765099647809}, // k=372 + {-0.1345807085071264, 0.9909026354277800}, // k=373 + {-0.1224106751992160, 0.9924795345987101}, // k=374 + {-0.1102222072938833, 0.9939069700023561}, // k=375 + {-0.0980171403295605, 0.9951847266721969}, // k=376 + {-0.0857973123444402, 0.9963126121827780}, // k=377 + {-0.0735645635996674, 0.9972904566786902}, // k=378 + {-0.0613207363022090, 0.9981181129001492}, // k=379 + {-0.0490676743274180, 0.9987954562051724}, // k=380 + {-0.0368072229413593, 0.9993223845883494}, // k=381 + {-0.0245412285229124, 0.9996988186962042}, // k=382 + {-0.0122715382857205, 0.9999247018391445}, // k=383 + {-0.0000000000000002, 1.0000000000000000}, // k=384 + {0.0122715382857201, 0.9999247018391445}, // k=385 + {0.0245412285229120, 0.9996988186962042}, // k=386 + {0.0368072229413590, 0.9993223845883495}, // k=387 + {0.0490676743274177, 0.9987954562051724}, // k=388 + {0.0613207363022086, 0.9981181129001492}, // k=389 + {0.0735645635996670, 0.9972904566786902}, // k=390 + {0.0857973123444399, 0.9963126121827780}, // k=391 + {0.0980171403295601, 0.9951847266721969}, // k=392 + {0.1102222072938829, 0.9939069700023561}, // k=393 + {0.1224106751992156, 0.9924795345987101}, // k=394 + {0.1345807085071260, 0.9909026354277800}, // k=395 + {0.1467304744553619, 0.9891765099647809}, // k=396 + {0.1588581433338612, 0.9873014181578584}, // k=397 + {0.1709618887603013, 0.9852776423889412}, // k=398 + {0.1830398879551406, 0.9831054874312164}, // k=399 + {0.1950903220161283, 0.9807852804032304}, // k=400 + {0.2071113761922181, 0.9783173707196278}, // k=401 + {0.2191012401568697, 0.9757021300385286}, // k=402 + {0.2310581082806706, 0.9729399522055603}, // k=403 + {0.2429801799032638, 0.9700312531945440}, // k=404 + {0.2548656596045140, 0.9669764710448523}, // k=405 + {0.2667127574748982, 0.9637760657954400}, // k=406 + {0.2785196893850533, 0.9604305194155658}, // k=407 + {0.2902846772544621, 0.9569403357322089}, // k=408 + {0.3020059493192281, 0.9533060403541939}, // k=409 + {0.3136817403988911, 0.9495281805930368}, // k=410 + {0.3253102921622629, 0.9456073253805213}, // k=411 + {0.3368898533922196, 0.9415440651830209}, // k=412 + {0.3484186802494345, 0.9373390119125750}, // k=413 + {0.3598950365349876, 0.9329927988347391}, // k=414 + {0.3713171939518374, 0.9285060804732156}, // k=415 + {0.3826834323650900, 0.9238795325112866}, // k=416 + {0.3939920400610479, 0.9191138516900579}, // k=417 + {0.4052413140049900, 0.9142097557035306}, // k=418 + {0.4164295600976369, 0.9091679830905225}, // k=419 + {0.4275550934302821, 0.9039892931234433}, // k=420 + {0.4386162385385273, 0.8986744656939540}, // k=421 + {0.4496113296546066, 0.8932243011955153}, // k=422 + {0.4605387109582396, 0.8876396204028542}, // k=423 + {0.4713967368259976, 0.8819212643483550}, // k=424 + {0.4821837720791222, 0.8760700941954069}, // k=425 + {0.4928981922297839, 0.8700869911087115}, // k=426 + {0.5035383837257178, 0.8639728561215866}, // k=427 + {0.5141027441932216, 0.8577286100002722}, // k=428 + {0.5245896826784691, 0.8513551931052651}, // k=429 + {0.5349976198870969, 0.8448535652497072}, // k=430 + {0.5453249884220465, 0.8382247055548380}, // k=431 + {0.5555702330196018, 0.8314696123025455}, // k=432 + {0.5657318107836131, 0.8245893027850253}, // k=433 + {0.5758081914178449, 0.8175848131515840}, // k=434 + {0.5857978574564388, 0.8104571982525949}, // k=435 + {0.5956993044924329, 0.8032075314806453}, // k=436 + {0.6055110414043253, 0.7958369046088837}, // k=437 + {0.6152315905806270, 0.7883464276266061}, // k=438 + {0.6248594881423861, 0.7807372285720946}, // k=439 + {0.6343932841636456, 0.7730104533627369}, // k=440 + {0.6438315428897912, 0.7651672656224592}, // k=441 + {0.6531728429537768, 0.7572088465064846}, // k=442 + {0.6624157775901715, 0.7491363945234596}, // k=443 + {0.6715589548470183, 0.7409511253549591}, // k=444 + {0.6806009977954527, 0.7326542716724131}, // k=445 + {0.6895405447370668, 0.7242470829514670}, // k=446 + {0.6983762494089724, 0.7157308252838190}, // k=447 + {0.7071067811865474, 0.7071067811865477}, // k=448 + {0.7157308252838188, 0.6983762494089727}, // k=449 + {0.7242470829514667, 0.6895405447370672}, // k=450 + {0.7326542716724129, 0.6806009977954530}, // k=451 + {0.7409511253549589, 0.6715589548470187}, // k=452 + {0.7491363945234594, 0.6624157775901718}, // k=453 + {0.7572088465064842, 0.6531728429537771}, // k=454 + {0.7651672656224588, 0.6438315428897915}, // k=455 + {0.7730104533627367, 0.6343932841636459}, // k=456 + {0.7807372285720944, 0.6248594881423865}, // k=457 + {0.7883464276266059, 0.6152315905806274}, // k=458 + {0.7958369046088833, 0.6055110414043257}, // k=459 + {0.8032075314806451, 0.5956993044924332}, // k=460 + {0.8104571982525947, 0.5857978574564391}, // k=461 + {0.8175848131515837, 0.5758081914178452}, // k=462 + {0.8245893027850251, 0.5657318107836136}, // k=463 + {0.8314696123025452, 0.5555702330196022}, // k=464 + {0.8382247055548377, 0.5453249884220468}, // k=465 + {0.8448535652497070, 0.5349976198870973}, // k=466 + {0.8513551931052649, 0.5245896826784694}, // k=467 + {0.8577286100002720, 0.5141027441932219}, // k=468 + {0.8639728561215864, 0.5035383837257181}, // k=469 + {0.8700869911087113, 0.4928981922297843}, // k=470 + {0.8760700941954067, 0.4821837720791226}, // k=471 + {0.8819212643483548, 0.4713967368259979}, // k=472 + {0.8876396204028539, 0.4605387109582399}, // k=473 + {0.8932243011955151, 0.4496113296546070}, // k=474 + {0.8986744656939538, 0.4386162385385277}, // k=475 + {0.9039892931234431, 0.4275550934302825}, // k=476 + {0.9091679830905224, 0.4164295600976373}, // k=477 + {0.9142097557035305, 0.4052413140049904}, // k=478 + {0.9191138516900577, 0.3939920400610483}, // k=479 + {0.9238795325112865, 0.3826834323650904}, // k=480 + {0.9285060804732155, 0.3713171939518378}, // k=481 + {0.9329927988347390, 0.3598950365349880}, // k=482 + {0.9373390119125748, 0.3484186802494349}, // k=483 + {0.9415440651830208, 0.3368898533922200}, // k=484 + {0.9456073253805212, 0.3253102921622634}, // k=485 + {0.9495281805930367, 0.3136817403988915}, // k=486 + {0.9533060403541936, 0.3020059493192286}, // k=487 + {0.9569403357322088, 0.2902846772544625}, // k=488 + {0.9604305194155657, 0.2785196893850537}, // k=489 + {0.9637760657954398, 0.2667127574748986}, // k=490 + {0.9669764710448522, 0.2548656596045144}, // k=491 + {0.9700312531945440, 0.2429801799032642}, // k=492 + {0.9729399522055602, 0.2310581082806710}, // k=493 + {0.9757021300385285, 0.2191012401568702}, // k=494 + {0.9783173707196277, 0.2071113761922185}, // k=495 + {0.9807852804032303, 0.1950903220161287}, // k=496 + {0.9831054874312163, 0.1830398879551410}, // k=497 + {0.9852776423889411, 0.1709618887603018}, // k=498 + {0.9873014181578583, 0.1588581433338616}, // k=499 + {0.9891765099647809, 0.1467304744553624}, // k=500 + {0.9909026354277800, 0.1345807085071264}, // k=501 + {0.9924795345987100, 0.1224106751992160}, // k=502 + {0.9939069700023561, 0.1102222072938834}, // k=503 + {0.9951847266721969, 0.0980171403295605}, // k=504 + {0.9963126121827780, 0.0857973123444403}, // k=505 + {0.9972904566786902, 0.0735645635996674}, // k=506 + {0.9981181129001492, 0.0613207363022091}, // k=507 + {0.9987954562051724, 0.0490676743274181}, // k=508 + {0.9993223845883494, 0.0368072229413594}, // k=509 + {0.9996988186962042, 0.0245412285229124}, // k=510 + {0.9999247018391445, 0.0122715382857206}, // k=511 +}; diff --git a/include/bout/utils.hxx b/include/bout/utils.hxx index e2ac814e53..c8383b12fa 100644 --- a/include/bout/utils.hxx +++ b/include/bout/utils.hxx @@ -422,14 +422,11 @@ inline BoutReal randomu() { * i.e. t * t */ template -inline T SQ(const T& t) { +inline auto SQ(const T& t) { return t * t; } -template <> -BOUT_HOST_DEVICE inline BoutReal SQ(const BoutReal& t) { - return t * t; -} +BOUT_HOST_DEVICE inline BoutReal SQ(const BoutReal& t) { return t * t; } /*! * Round \p x to the nearest integer diff --git a/src/field/field3d.cxx b/src/field/field3d.cxx index 0d2bc0694e..9ea488d8f1 100644 --- a/src/field/field3d.cxx +++ b/src/field/field3d.cxx @@ -805,7 +805,8 @@ bool operator==(const Field3D& a, const Field3D& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return min(abs(a - b)) < 1e-10; + Field3D Sub = a - b; + return min(Sub) < 1e-10; } std::ostream& operator<<(std::ostream& out, const Field3D& value) { diff --git a/src/field/fieldperp.cxx b/src/field/fieldperp.cxx index ca9bdc0397..9578aa0d9d 100644 --- a/src/field/fieldperp.cxx +++ b/src/field/fieldperp.cxx @@ -209,7 +209,7 @@ bool operator==(const FieldPerp& a, const FieldPerp& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return (a.getIndex() == b.getIndex()) and (min(abs(a - b)) < 1e-10); + return (a.getIndex() == b.getIndex()) and (min(FieldPerp{abs(a - b)}) < 1e-10); } std::ostream& operator<<(std::ostream& out, const FieldPerp& value) { diff --git a/src/field/gen_fieldops.jinja b/src/field/gen_fieldops.jinja index ecd4e628cc..60f9cbbd7e 100644 --- a/src/field/gen_fieldops.jinja +++ b/src/field/gen_fieldops.jinja @@ -8,6 +8,26 @@ checkData({{lhs.name}}); checkData({{rhs.name}}); + {% if (region_loop == "BOUT_FOR_RAJA") %} + {% if out.field_type == "FieldPerp" %} + auto {{out.name}}_acc = FieldPerpAccessor{ {{out.name}} }; + {% else %} + auto {{out.name}}_acc = FieldAccessor({{out.name}}); + {% endif %} + {% if lhs.field_type == "FieldPerp" %} + auto {{lhs.name}}_acc = FieldPerpAccessor{ {{lhs.name}} }; + {% elif lhs.field_type == "BoutReal" %} + {% else %} + auto {{lhs.name}}_acc = FieldAccessor({{lhs.name}}); + {% endif %} + {% if rhs.field_type == "FieldPerp" %} + auto {{rhs.name}}_acc = FieldPerpAccessor{ {{rhs.name}} }; + {% elif rhs.field_type == "BoutReal" %} + {% else %} + auto {{rhs.name}}_acc = FieldAccessor({{rhs.name}}); + {% endif %} + {% endif %} + {% if out == "Field3D" %} {% if lhs == rhs == "Field3D" %} {{out.name}}.setRegion({{lhs.name}}.getMesh()->getCommonRegion({{lhs.name}}.getRegionID(), @@ -20,45 +40,98 @@ {% endif %} {% if (out == "Field3D") and ((lhs == "Field2D") or (rhs =="Field2D")) %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + int mesh_nz = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}_acc.mesh_nz; + {% else %} Mesh *localmesh = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}.getMesh(); + {% endif %} {% if (lhs == "Field2D") %} {{region_loop}}({{index_var}}, {{lhs.name}}.getRegion({{region_name}})) { {% else %} {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { {% endif %} - const auto {{mixed_base_ind}} = localmesh->ind2Dto3D({{index_var}}); - {% if (operator == "/") and (rhs == "Field2D") %} - const auto tmp = 1.0 / {{rhs.mixed_index}}; - for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ - {{out.mixed_index}} = {{lhs.mixed_index}} * tmp; + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = {{index_var}} * mesh_nz; + {% else %} + const auto {{mixed_base_ind}} = localmesh->ind2Dto3D({{index_var}}); + {% endif %} + {% if (operator == "/") and (rhs == "Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto tmp = 1.0 / {{rhs.mixed_index_acc}}; {% else %} - for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ - {{out.mixed_index}} = {{lhs.mixed_index}} {{operator}} {{rhs.mixed_index}}; + const auto tmp = 1.0 / {{rhs.mixed_index}}; {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.mixed_index_acc}} = {{lhs.mixed_index_acc}} * tmp; + {% else %} + {{out.mixed_index}} = {{lhs.mixed_index}} * tmp; + {% endif %} + {% else %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.mixed_index_acc}} = {{lhs.mixed_index_acc}} {{operator}} {{rhs.mixed_index_acc}}; + {% else %} + {{out.mixed_index}} = {{lhs.mixed_index}} {{operator}} {{rhs.mixed_index}}; + {% endif %} + {% endif %} } - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif out == "FieldPerp" and (lhs == "Field2D" or lhs == "Field3D" or rhs == "Field2D" or rhs == "Field3D")%} Mesh *localmesh = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}.getMesh(); {{region_loop}}({{index_var}}, {{out.name}}.getRegion({{region_name}})) { - int yind = {{lhs.name if lhs == "FieldPerp" else rhs.name}}.getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = {{lhs.name if lhs == "FieldPerp" else rhs.name}}_acc.getIndex(); + {% else %} + int yind = {{lhs.name if lhs == "FieldPerp" else rhs.name}}.getIndex(); + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + ; // DONE2 + const auto {{mixed_base_ind}} = localmesh->flatIndPerpto3D({{index_var}}, result_acc.nz, yind); + {% else %} + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + {% endif %} {% if lhs != "FieldPerp" %} - {{out.index}} = {{lhs.base_index}} {{operator}} {{rhs.index}}; + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.base_index_acc}} {{operator}} {{rhs.index_acc}}; + {% else %} + {{out.index}} = {{lhs.base_index}} {{operator}} {{rhs.index}}; + {% endif %} {% else %} - {{out.index}} = {{lhs.index}} {{operator}} {{rhs.base_index}}; + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} {{operator}} {{rhs.base_index_acc}}; + {% else %} + {{out.index}} = {{lhs.index}} {{operator}} {{rhs.base_index}}; + {% endif %} {% endif %} - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif (operator == "/") and (rhs == "BoutReal") %} const auto tmp = 1.0 / {{rhs.index}}; {{region_loop}}({{index_var}}, {{out.name}}.getValidRegionWithDefault({{region_name}})) { - {{out.index}} = {{lhs.index}} * tmp; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} * tmp; + {% else %} + {{out.index}} = {{lhs.index}} * tmp; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% else %} {{region_loop}}({{index_var}}, {{out.name}}.getValidRegionWithDefault({{region_name}})) { - {{out.index}} = {{lhs.index}} {{operator}} {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} {{operator}} {{rhs.index_acc}}; + {% else %} + {{out.index}} = {{lhs.index}} {{operator}} {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% endif %} checkData({{out.name}}); @@ -84,49 +157,102 @@ checkData(*this); checkData({{rhs.name}}); + {% if (region_loop == "BOUT_FOR_RAJA") %} + {% if lhs.field_type == "FieldPerp" %} + auto this_acc = FieldPerpAccessor{(*this)}; + {% else %} + auto this_acc = FieldAccessor(*this); + {% endif %} + {% if rhs.field_type == "FieldPerp" %} + auto {{rhs.name}}_acc = FieldPerpAccessor{ {{rhs.name}} }; + {% elif rhs.field_type == "BoutReal" %} + {% else %} + auto {{rhs.name}}_acc = FieldAccessor({{rhs.name}}); + {% endif %} + {% endif %} + {% if lhs == rhs == "Field3D" %} regionID = fieldmesh->getCommonRegion(regionID, {{rhs.name}}.regionID); {% endif %} - {% if (lhs == "Field3D") and (rhs =="Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + int mesh_nz = fieldmesh->LocalNz; + {% endif %} {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { - const auto {{mixed_base_ind}} = fieldmesh->ind2Dto3D({{index_var}}); - {% if (operator == "/") and (rhs == "Field2D") %} - const auto tmp = 1.0 / {{rhs.mixed_index}}; - for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ - (*this)[{{mixed_base_ind}} + {{jz_var}}] *= tmp; + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = {{index_var}} * mesh_nz; + {% else %} + const auto {{mixed_base_ind}} = fieldmesh->ind2Dto3D({{index_var}}); + {% endif %} + {% if (operator == "/") and (rhs == "Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto tmp = 1.0 / {{rhs.mixed_index_acc}}; + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + this_acc[{{mixed_base_ind}} + {{jz_var}}] *= tmp; {% else %} - for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ - (*this)[{{mixed_base_ind}} + {{jz_var}}] {{operator}}= {{rhs.index}}; + const auto tmp = 1.0 / {{rhs.mixed_index}}; + for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ + (*this)[{{mixed_base_ind}} + {{jz_var}}] *= tmp; {% endif %} + {% else %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + this_acc[{{mixed_base_ind}} + {{jz_var}}] {{operator}}= {{rhs.index_acc}}; + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ + (*this)[{{mixed_base_ind}} + {{jz_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + {% endif %} } - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif lhs == "FieldPerp" and (rhs == "Field3D" or rhs == "Field2D")%} Mesh *localmesh = this->getMesh(); + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = this->getIndex(); + {% endif %} {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { - int yind = this->getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); - (*this)[{{index_var}}] {{operator}}= {{rhs.base_index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = localmesh->flatIndPerpto3D({{index_var}}, yind); + this_acc[{{index_var}}] {{operator}}= {{rhs.base_index_acc}}; + {% else %} + int yind = this->getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + (*this)[{{index_var}}] {{operator}}= {{rhs.base_index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif rhs == "FieldPerp" and (lhs == "Field3D" or lhs == "Field2D")%} Mesh *localmesh = this->getMesh(); {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { - int yind = {{rhs.name}}.getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); - (*this)[{{base_ind_var}}] {{operator}}= {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = {{rhs.name}}.getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + this_acc[{{base_ind_var}}] {{operator}}= {{rhs.index}}; + {% else %} + int yind = {{rhs.name}}.getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + (*this)[{{base_ind_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif (operator == "/") and (lhs == "Field3D" or lhs == "Field2D") and (rhs =="BoutReal") %} const auto tmp = 1.0 / {{rhs.index}}; {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { + {% if (region_loop == "BOUT_FOR_RAJA") %} + this_acc[{{index_var}}] *= tmp; + {% else %} (*this)[{{index_var}}] *= tmp; - } + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% else %} {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { - (*this)[{{index_var}}] {{operator}}= {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + this_acc[{{index_var}}] {{operator}}= {{rhs.index_acc}}; + {% else %} + (*this)[{{index_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% endif %} checkData(*this); diff --git a/src/field/gen_fieldops.py b/src/field/gen_fieldops.py index 29631ff7aa..bf06a8ea5c 100755 --- a/src/field/gen_fieldops.py +++ b/src/field/gen_fieldops.py @@ -132,6 +132,17 @@ def index(self): else: return "{self.name}[{self.index_var}]".format(self=self) + @property + def index_acc(self): + """Returns "_acc[{index_var}]" for an accessor-based index, except if + field_type is BoutReal, in which case just returns "" + + """ + if self.field_type == "BoutReal": + return "{self.name}".format(self=self) + else: + return "{self.name}_acc[{self.index_var}]".format(self=self) + @property def mixed_index(self): """Returns "[{index_var} + {jz_var}]" if field_type is Field3D, @@ -147,6 +158,21 @@ def mixed_index(self): else: # Field2D return "{self.name}[{self.index_var}]".format(self=self) + @property + def mixed_index_acc(self): + """Returns "_acc[{index_var} + {jz_var}]" for an accessor if field_type + is Field3D, self.index if Field2D or just returns "" for BoutReal + + """ + if self.field_type == "BoutReal": + return "{self.name}_acc".format(self=self) + elif self.field_type == "Field3D": + return "{self.name}_acc[{self.mixed_base_ind_var} + {self.jz_var}]".format( + self=self + ) + else: # Field2D + return "{self.name}_acc[{self.index_var}]".format(self=self) + @property def base_index(self): """Returns "[{mixed_base_ind_var}]" if field_type is Field3D, Field2D or FieldPerp @@ -158,6 +184,18 @@ def base_index(self): else: return "{self.name}[{self.mixed_base_ind_var}]".format(self=self) + @property + def base_index_acc(self): + """Returns "_acc[{mixed_base_ind_var}]" for an accessor if field_type is + Field3D, Field2D or FieldPerp or just returns "" for BoutReal + + """ + if self.field_type == "BoutReal": + return "{self.name}".format(self=self) + else: + return "{self.name}_acc[{self.mixed_base_ind_var}]".format(self=self) + + def __eq__(self, other): try: return self.field_type == other.field_type @@ -198,11 +236,11 @@ def returnType(f1, f2): ) # By default use OpenMP enabled loops but allow to disable parser.add_argument( - "--no-openmp", - action="store_false", - default=False, - dest="noOpenMP", - help="Don't use OpenMP compatible loops", + "--loop-exec", + default="openmp", + dest="loop_exec", + choices=["serial", "openmp", "raja"], + help="Choose the loop execution method. Default is OpenMP", ) args = parser.parse_args() @@ -213,10 +251,16 @@ def returnType(f1, f2): mixed_base_ind_var = "base_ind" region_name = '"RGN_ALL"' - if args.noOpenMP: + if args.loop_exec == "openmp": + region_loop = "BOUT_FOR" + elif args.loop_exec == "raja": + region_loop = "BOUT_FOR_RAJA" + header += "#include \n" + header += "#include \n" + elif args.loop_exec == "serial": region_loop = "BOUT_FOR_SERIAL" else: - region_loop = "BOUT_FOR" + raise ValueError("Unknown loop execution method") # Declare what fields we currently support: # Field perp is currently missing diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 6b778acee3..022fedbd17 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -6,872 +6,9 @@ #include #include -// Provide the C++ wrapper for multiplication of Field3D and Field3D -Field3D operator*(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by multiplication with Field3D -Field3D& Field3D::operator*=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field3D and Field3D -Field3D operator/(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by division with Field3D -Field3D& Field3D::operator/=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -// Provide the C++ wrapper for addition of Field3D and Field3D -Field3D operator+(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by addition with Field3D -Field3D& Field3D::operator+=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -// Provide the C++ wrapper for subtraction of Field3D and Field3D -Field3D operator-(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by subtraction with Field3D -Field3D& Field3D::operator-=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - -// Provide the C++ wrapper for multiplication of Field3D and Field2D -Field3D operator*(const Field3D& lhs, const Field2D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] * rhs[index]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by multiplication with Field2D -Field3D& Field3D::operator*=(const Field2D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] *= rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field3D and Field2D -Field3D operator/(const Field3D& lhs, const Field2D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - const auto tmp = 1.0 / rhs[index]; - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] * tmp; - } - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by division with Field2D -Field3D& Field3D::operator/=(const Field2D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - const auto tmp = 1.0 / rhs[index]; - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] *= tmp; - } - } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -// Provide the C++ wrapper for addition of Field3D and Field2D -Field3D operator+(const Field3D& lhs, const Field2D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] + rhs[index]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by addition with Field2D -Field3D& Field3D::operator+=(const Field2D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] += rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -// Provide the C++ wrapper for subtraction of Field3D and Field2D -Field3D operator-(const Field3D& lhs, const Field2D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] - rhs[index]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by subtraction with Field2D -Field3D& Field3D::operator-=(const Field2D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] -= rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - -// Provide the C++ wrapper for multiplication of Field3D and FieldPerp -FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - FieldPerp result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, result.getRegion("RGN_ALL")) { - int yind = rhs.getIndex(); - const auto base_ind = localmesh->indPerpto3D(index, yind); - result[index] = lhs[base_ind] * rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for division of Field3D and FieldPerp -FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - FieldPerp result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, result.getRegion("RGN_ALL")) { - int yind = rhs.getIndex(); - const auto base_ind = localmesh->indPerpto3D(index, yind); - result[index] = lhs[base_ind] / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for addition of Field3D and FieldPerp -FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - FieldPerp result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, result.getRegion("RGN_ALL")) { - int yind = rhs.getIndex(); - const auto base_ind = localmesh->indPerpto3D(index, yind); - result[index] = lhs[base_ind] + rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for subtraction of Field3D and FieldPerp -FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - FieldPerp result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, result.getRegion("RGN_ALL")) { - int yind = rhs.getIndex(); - const auto base_ind = localmesh->indPerpto3D(index, yind); - result[index] = lhs[base_ind] - rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for multiplication of Field3D and BoutReal -Field3D operator*(const Field3D& lhs, const BoutReal rhs) { - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by multiplication with BoutReal -Field3D& Field3D::operator*=(const BoutReal rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field3D and BoutReal -Field3D operator/(const Field3D& lhs, const BoutReal rhs) { - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * tmp; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by division with BoutReal -Field3D& Field3D::operator/=(const BoutReal rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= tmp; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -// Provide the C++ wrapper for addition of Field3D and BoutReal -Field3D operator+(const Field3D& lhs, const BoutReal rhs) { - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by addition with BoutReal -Field3D& Field3D::operator+=(const BoutReal rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -// Provide the C++ wrapper for subtraction of Field3D and BoutReal -Field3D operator-(const Field3D& lhs, const BoutReal rhs) { - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by subtraction with BoutReal -Field3D& Field3D::operator-=(const BoutReal rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - -// Provide the C++ wrapper for multiplication of Field2D and Field3D -Field3D operator*(const Field2D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] * rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for division of Field2D and Field3D -Field3D operator/(const Field2D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] / rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for addition of Field2D and Field3D -Field3D operator+(const Field2D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] + rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for subtraction of Field2D and Field3D -Field3D operator-(const Field2D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] - rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for multiplication of Field2D and Field2D -Field2D operator*(const Field2D& lhs, const Field2D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field2D by multiplication with Field2D -Field2D& Field2D::operator*=(const Field2D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field2D and Field2D -Field2D operator/(const Field2D& lhs, const Field2D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field2D by division with Field2D -Field2D& Field2D::operator/=(const Field2D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -// Provide the C++ wrapper for addition of Field2D and Field2D -Field2D operator+(const Field2D& lhs, const Field2D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field2D by addition with Field2D -Field2D& Field2D::operator+=(const Field2D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -// Provide the C++ wrapper for subtraction of Field2D and Field2D -Field2D operator-(const Field2D& lhs, const Field2D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field2D by subtraction with Field2D -Field2D& Field2D::operator-=(const Field2D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - -// Provide the C++ wrapper for multiplication of Field2D and FieldPerp -FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { +// Provide the C++ wrapper for multiplication of Field3D and FieldPerp +FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -890,8 +27,9 @@ FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { return result; } -// Provide the C++ wrapper for division of Field2D and FieldPerp -FieldPerp operator/(const Field2D& lhs, const FieldPerp& rhs) { +// Provide the C++ wrapper for division of Field3D and FieldPerp +FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -910,8 +48,9 @@ FieldPerp operator/(const Field2D& lhs, const FieldPerp& rhs) { return result; } -// Provide the C++ wrapper for addition of Field2D and FieldPerp -FieldPerp operator+(const Field2D& lhs, const FieldPerp& rhs) { +// Provide the C++ wrapper for addition of Field3D and FieldPerp +FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -930,8 +69,9 @@ FieldPerp operator+(const Field2D& lhs, const FieldPerp& rhs) { return result; } -// Provide the C++ wrapper for subtraction of Field2D and FieldPerp -FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { +// Provide the C++ wrapper for subtraction of Field3D and FieldPerp +FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -950,146 +90,93 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } -// Provide the C++ wrapper for multiplication of Field2D and BoutReal -Field2D operator*(const Field2D& lhs, const BoutReal rhs) { +// Provide the C++ wrapper for multiplication of Field2D and FieldPerp +FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - Field2D result{emptyFrom(lhs)}; + FieldPerp result{emptyFrom(rhs)}; checkData(lhs); checkData(rhs); - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs; + Mesh* localmesh = lhs.getMesh(); + + BOUT_FOR(index, result.getRegion("RGN_ALL")) { + int yind = rhs.getIndex(); + const auto base_ind = localmesh->indPerpto3D(index, yind); + result[index] = lhs[base_ind] * rhs[index]; } checkData(result); return result; } -// Provide the C++ operator to update Field2D by multiplication with BoutReal -Field2D& Field2D::operator*=(const BoutReal rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field2D and BoutReal -Field2D operator/(const Field2D& lhs, const BoutReal rhs) { +// Provide the C++ wrapper for division of Field2D and FieldPerp +FieldPerp operator/(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - Field2D result{emptyFrom(lhs)}; + FieldPerp result{emptyFrom(rhs)}; checkData(lhs); checkData(rhs); - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * tmp; + Mesh* localmesh = lhs.getMesh(); + + BOUT_FOR(index, result.getRegion("RGN_ALL")) { + int yind = rhs.getIndex(); + const auto base_ind = localmesh->indPerpto3D(index, yind); + result[index] = lhs[base_ind] / rhs[index]; } checkData(result); return result; } -// Provide the C++ operator to update Field2D by division with BoutReal -Field2D& Field2D::operator/=(const BoutReal rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= tmp; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -// Provide the C++ wrapper for addition of Field2D and BoutReal -Field2D operator+(const Field2D& lhs, const BoutReal rhs) { +// Provide the C++ wrapper for addition of Field2D and FieldPerp +FieldPerp operator+(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - Field2D result{emptyFrom(lhs)}; + FieldPerp result{emptyFrom(rhs)}; checkData(lhs); checkData(rhs); - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs; + Mesh* localmesh = lhs.getMesh(); + + BOUT_FOR(index, result.getRegion("RGN_ALL")) { + int yind = rhs.getIndex(); + const auto base_ind = localmesh->indPerpto3D(index, yind); + result[index] = lhs[base_ind] + rhs[index]; } checkData(result); return result; } -// Provide the C++ operator to update Field2D by addition with BoutReal -Field2D& Field2D::operator+=(const BoutReal rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -// Provide the C++ wrapper for subtraction of Field2D and BoutReal -Field2D operator-(const Field2D& lhs, const BoutReal rhs) { +// Provide the C++ wrapper for subtraction of Field2D and FieldPerp +FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - Field2D result{emptyFrom(lhs)}; + FieldPerp result{emptyFrom(rhs)}; checkData(lhs); checkData(rhs); - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs; + Mesh* localmesh = lhs.getMesh(); + + BOUT_FOR(index, result.getRegion("RGN_ALL")) { + int yind = rhs.getIndex(); + const auto base_ind = localmesh->indPerpto3D(index, yind); + result[index] = lhs[base_ind] - rhs[index]; } checkData(result); return result; } -// Provide the C++ operator to update Field2D by subtraction with BoutReal -Field2D& Field2D::operator-=(const BoutReal rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - // Provide the C++ wrapper for multiplication of FieldPerp and Field3D FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1110,6 +197,7 @@ FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with Field3D FieldPerp& FieldPerp::operator*=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1136,6 +224,7 @@ FieldPerp& FieldPerp::operator*=(const Field3D& rhs) { // Provide the C++ wrapper for division of FieldPerp and Field3D FieldPerp operator/(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1156,6 +245,7 @@ FieldPerp operator/(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by division with Field3D FieldPerp& FieldPerp::operator/=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1182,6 +272,7 @@ FieldPerp& FieldPerp::operator/=(const Field3D& rhs) { // Provide the C++ wrapper for addition of FieldPerp and Field3D FieldPerp operator+(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1202,6 +293,7 @@ FieldPerp operator+(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by addition with Field3D FieldPerp& FieldPerp::operator+=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1228,6 +320,7 @@ FieldPerp& FieldPerp::operator+=(const Field3D& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and Field3D FieldPerp operator-(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1248,6 +341,7 @@ FieldPerp operator-(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with Field3D FieldPerp& FieldPerp::operator-=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1274,6 +368,7 @@ FieldPerp& FieldPerp::operator-=(const Field3D& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and Field2D FieldPerp operator*(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1294,6 +389,7 @@ FieldPerp operator*(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with Field2D FieldPerp& FieldPerp::operator*=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1320,6 +416,7 @@ FieldPerp& FieldPerp::operator*=(const Field2D& rhs) { // Provide the C++ wrapper for division of FieldPerp and Field2D FieldPerp operator/(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1340,6 +437,7 @@ FieldPerp operator/(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by division with Field2D FieldPerp& FieldPerp::operator/=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1366,6 +464,7 @@ FieldPerp& FieldPerp::operator/=(const Field2D& rhs) { // Provide the C++ wrapper for addition of FieldPerp and Field2D FieldPerp operator+(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1386,6 +485,7 @@ FieldPerp operator+(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by addition with Field2D FieldPerp& FieldPerp::operator+=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1412,6 +512,7 @@ FieldPerp& FieldPerp::operator+=(const Field2D& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and Field2D FieldPerp operator-(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1432,6 +533,7 @@ FieldPerp operator-(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with Field2D FieldPerp& FieldPerp::operator-=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1458,6 +560,7 @@ FieldPerp& FieldPerp::operator-=(const Field2D& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and FieldPerp FieldPerp operator*(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1474,6 +577,7 @@ FieldPerp operator*(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with FieldPerp FieldPerp& FieldPerp::operator*=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1494,6 +598,7 @@ FieldPerp& FieldPerp::operator*=(const FieldPerp& rhs) { // Provide the C++ wrapper for division of FieldPerp and FieldPerp FieldPerp operator/(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1510,6 +615,7 @@ FieldPerp operator/(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by division with FieldPerp FieldPerp& FieldPerp::operator/=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1530,6 +636,7 @@ FieldPerp& FieldPerp::operator/=(const FieldPerp& rhs) { // Provide the C++ wrapper for addition of FieldPerp and FieldPerp FieldPerp operator+(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1546,6 +653,7 @@ FieldPerp operator+(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by addition with FieldPerp FieldPerp& FieldPerp::operator+=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1566,6 +674,7 @@ FieldPerp& FieldPerp::operator+=(const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and FieldPerp FieldPerp operator-(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1582,6 +691,7 @@ FieldPerp operator-(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with FieldPerp FieldPerp& FieldPerp::operator-=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1602,6 +712,7 @@ FieldPerp& FieldPerp::operator-=(const FieldPerp& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and BoutReal FieldPerp operator*(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1617,6 +728,7 @@ FieldPerp operator*(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by multiplication with BoutReal FieldPerp& FieldPerp::operator*=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1636,6 +748,7 @@ FieldPerp& FieldPerp::operator*=(const BoutReal rhs) { // Provide the C++ wrapper for division of FieldPerp and BoutReal FieldPerp operator/(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1652,6 +765,7 @@ FieldPerp operator/(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by division with BoutReal FieldPerp& FieldPerp::operator/=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1671,6 +785,7 @@ FieldPerp& FieldPerp::operator/=(const BoutReal rhs) { // Provide the C++ wrapper for addition of FieldPerp and BoutReal FieldPerp operator+(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1686,6 +801,7 @@ FieldPerp operator+(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by addition with BoutReal FieldPerp& FieldPerp::operator+=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1705,6 +821,7 @@ FieldPerp& FieldPerp::operator+=(const BoutReal rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and BoutReal FieldPerp operator-(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1720,6 +837,7 @@ FieldPerp operator-(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by subtraction with BoutReal FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1737,136 +855,9 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } -// Provide the C++ wrapper for multiplication of BoutReal and Field3D -Field3D operator*(const BoutReal lhs, const Field3D& rhs) { - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs * rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for division of BoutReal and Field3D -Field3D operator/(const BoutReal lhs, const Field3D& rhs) { - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for addition of BoutReal and Field3D -Field3D operator+(const BoutReal lhs, const Field3D& rhs) { - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs + rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for subtraction of BoutReal and Field3D -Field3D operator-(const BoutReal lhs, const Field3D& rhs) { - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs - rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for multiplication of BoutReal and Field2D -Field2D operator*(const BoutReal lhs, const Field2D& rhs) { - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs * rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for division of BoutReal and Field2D -Field2D operator/(const BoutReal lhs, const Field2D& rhs) { - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for addition of BoutReal and Field2D -Field2D operator+(const BoutReal lhs, const Field2D& rhs) { - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs + rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for subtraction of BoutReal and Field2D -Field2D operator-(const BoutReal lhs, const Field2D& rhs) { - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs - rhs[index]; - } - - checkData(result); - return result; -} - // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1882,6 +873,7 @@ FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for division of BoutReal and FieldPerp FieldPerp operator/(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1897,6 +889,7 @@ FieldPerp operator/(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for addition of BoutReal and FieldPerp FieldPerp operator+(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1912,6 +905,7 @@ FieldPerp operator+(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of BoutReal and FieldPerp FieldPerp operator-(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); diff --git a/src/field/vecops.cxx b/src/field/vecops.cxx index 5f34e2af02..95409963f6 100644 --- a/src/field/vecops.cxx +++ b/src/field/vecops.cxx @@ -187,7 +187,7 @@ Field3D Div(const Vector3D& v, CELL_LOC outloc, const std::string& method) { Vector3D vcn = v; vcn.toContravariant(); - auto vcnJy = vcn.y.getCoordinates()->J * vcn.y; + Field3D vcnJy = vcn.y.getCoordinates()->J * vcn.y; if (v.y.hasParallelSlices()) { // If v.y has parallel slices then we are using ShiftedMetric (with // mesh:calcParallelSlices_on_communicate=true) or FCI, so we should calculate diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx index e6f68d850d..203b0c0abd 100644 --- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx +++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx @@ -269,8 +269,9 @@ Field3D LaplaceNaulin::solve(const Field3D& rhs, const Field3D& x0) { delp2solver->setCoefC2(C2coef_DC); // Use this below to normalize error for relative error estimate + Field3D SQField = SQ(rhsOverD); BoutReal RMS_rhsOverD = sqrt(mean( - SQ(rhsOverD), true, + SQField, true, "RGN_NOBNDRY")); // use sqrt(mean(SQ)) to make sure we do not divide by zero at a point BoutReal error_rel = 1e20, error_abs = 1e20, last_error = error_abs; diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 34c524d1e7..0139d21fd7 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -23,6 +23,8 @@ #include "parallel/fci.hxx" #include "parallel/shiftedmetricinterp.hxx" +#include "bout/coordinates_accessor.hxx" + // use anonymous namespace so this utility function is not available outside this file namespace { template @@ -551,7 +553,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) transform.get()); // Compare calculated and loaded values - output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(J - Jcalc))); + Field2D diff = J - Jcalc; + output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(diff))); communicate(J); @@ -576,7 +579,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) Bxy = interpolateAndExtrapolate(Bxy, location, extrapolate_x, extrapolate_y, false, transform.get()); - output_warn.write("\tMaximum difference in Bxy is {:e}\n", max(abs(Bxy - Bcalc))); + FieldMetric diff = Bxy - Bcalc; + output_warn.write("\tMaximum difference in Bxy is {:e}\n", max(abs(diff))); } // Check Bxy @@ -757,8 +761,9 @@ Coordinates::Coordinates(Mesh* mesh, Options* options, const CELL_LOC loc, J = interpolateAndExtrapolate(J, location, extrapolate_x, extrapolate_y, false, transform.get()); + FieldMetric diff = J - Jcalc; // Compare calculated and loaded values - output_warn.write("\tMaximum difference in J is %e\n", max(abs(J - Jcalc))); + output_warn.write("\tMaximum difference in J is %e\n", max(abs(diff))); // Re-evaluate Bxy using new J Bxy = sqrt(g_22) / J; @@ -783,7 +788,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options, const CELL_LOC loc, Bxy = interpolateAndExtrapolate(Bxy, location, extrapolate_x, extrapolate_y, false, transform.get()); - output_warn.write("\tMaximum difference in Bxy is %e\n", max(abs(Bxy - Bcalc))); + FieldMetric diff = Bxy - Bcalc; + output_warn.write("\tMaximum difference in Bxy is %e\n", max(abs(diff))); } // Check Bxy @@ -1027,7 +1033,7 @@ int Coordinates::geometry(bool recalculate_staggered, G3_23 = 0.5 * g13 * (DDZ(g_12) + DDY(g_13) - DDX(g_23)) + 0.5 * g23 * DDZ(g_22) + 0.5 * g33 * DDY(g_33); - auto tmp = J * g12; + FieldMetric tmp = J * g12; communicate(tmp); G1 = (DDX(J * g11) + DDY(tmp) + DDZ(J * g13)) / J; tmp = J * g22; @@ -1096,7 +1102,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2x, "d2x" + suffix, 0.0, false, location)) { output_warn.write( "\tWARNING: differencing quantity 'd2x' not found. Calculating from dx\n"); - d1_dx = bout::derivatives::index::DDX(1. / dx); // d/di(1/dx) + d1_dx = bout::derivatives::index::DDX(FieldMetric{1. / dx}); // d/di(1/dx) communicate(d1_dx); d1_dx = @@ -1150,7 +1156,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2x, "d2x", 0.0, false)) { output_warn.write( "\tWARNING: differencing quantity 'd2x' not found. Calculating from dx\n"); - d1_dx = bout::derivatives::index::DDX(1. / dx); // d/di(1/dx) + d1_dx = bout::derivatives::index::DDX(FieldMetric{1. / dx}); // d/di(1/dx) communicate(d1_dx); d1_dx = @@ -1203,10 +1209,11 @@ int Coordinates::geometry(bool recalculate_staggered, localmesh->recalculateStaggeredCoordinates(); } - // Invalidate and recalculate cached variables + // Invalidate and recalculate cached variables and any accessor zlength_cache.reset(); Grad2_par2_DDY_invSgCache.clear(); invSgCache.reset(); + CoordinatesAccessor::clear(this); return 0; } @@ -1259,15 +1266,15 @@ int Coordinates::calcCovariant(const std::string& region) { } BoutReal maxerr; - maxerr = BOUTMAX(max(abs((g_11 * g11 + g_12 * g12 + g_13 * g13) - 1)), - max(abs((g_12 * g12 + g_22 * g22 + g_23 * g23) - 1)), - max(abs((g_13 * g13 + g_23 * g23 + g_33 * g33) - 1))); + maxerr = BOUTMAX(max(abs(FieldMetric{(g_11 * g11 + g_12 * g12 + g_13 * g13) - 1})), + max(abs(FieldMetric{(g_12 * g12 + g_22 * g22 + g_23 * g23) - 1})), + max(abs(FieldMetric{(g_13 * g13 + g_23 * g23 + g_33 * g33) - 1}))); output_info.write("\tLocal maximum error in diagonal inversion is {:e}\n", maxerr); - maxerr = BOUTMAX(max(abs(g_11 * g12 + g_12 * g22 + g_13 * g23)), - max(abs(g_11 * g13 + g_12 * g23 + g_13 * g33)), - max(abs(g_12 * g13 + g_22 * g23 + g_23 * g33))); + maxerr = BOUTMAX(max(abs(FieldMetric{g_11 * g12 + g_12 * g22 + g_13 * g23})), + max(abs(FieldMetric{g_11 * g13 + g_12 * g23 + g_13 * g33})), + max(abs(FieldMetric{g_12 * g13 + g_22 * g23 + g_23 * g33}))); output_info.write("\tLocal maximum error in off-diagonal inversion is {:e}\n", maxerr); @@ -1315,15 +1322,15 @@ int Coordinates::calcContravariant(const std::string& region) { } BoutReal maxerr; - maxerr = BOUTMAX(max(abs((g_11 * g11 + g_12 * g12 + g_13 * g13) - 1)), - max(abs((g_12 * g12 + g_22 * g22 + g_23 * g23) - 1)), - max(abs((g_13 * g13 + g_23 * g23 + g_33 * g33) - 1))); + maxerr = BOUTMAX(max(abs(FieldMetric{(g_11 * g11 + g_12 * g12 + g_13 * g13) - 1})), + max(abs(FieldMetric{(g_12 * g12 + g_22 * g22 + g_23 * g23) - 1})), + max(abs(FieldMetric{(g_13 * g13 + g_23 * g23 + g_33 * g33) - 1}))); output_info.write("\tMaximum error in diagonal inversion is {:e}\n", maxerr); - maxerr = BOUTMAX(max(abs(g_11 * g12 + g_12 * g22 + g_13 * g23)), - max(abs(g_11 * g13 + g_12 * g23 + g_13 * g33)), - max(abs(g_12 * g13 + g_22 * g23 + g_23 * g33))); + maxerr = BOUTMAX(max(abs(FieldMetric{g_11 * g12 + g_12 * g22 + g_13 * g23})), + max(abs(FieldMetric{g_11 * g13 + g_12 * g23 + g_13 * g33})), + max(abs(FieldMetric{g_12 * g13 + g_22 * g23 + g_23 * g33}))); output_info.write("\tMaximum error in off-diagonal inversion is {:e}\n", maxerr); return 0; @@ -1336,8 +1343,8 @@ int Coordinates::jacobian() { const bool extrapolate_x = not localmesh->sourceHasXBoundaryGuards(); const bool extrapolate_y = not localmesh->sourceHasYBoundaryGuards(); - auto g = g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 - g22 * g13 * g13 - - g33 * g12 * g12; + auto g = FieldMetric{g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 + - g22 * g13 * g13 - g33 * g12 * g12}; // Check that g is positive bout::checkPositive(g, "The determinant of g^ij", "RGN_NOBNDRY"); diff --git a/src/mesh/coordinates_accessor.cxx b/src/mesh/coordinates_accessor.cxx index aff546c2b0..196234d999 100644 --- a/src/mesh/coordinates_accessor.cxx +++ b/src/mesh/coordinates_accessor.cxx @@ -40,8 +40,9 @@ CoordinatesAccessor::CoordinatesAccessor(const Coordinates* coords) { // Copy data from Coordinates variable into data array // Uses the symbol to look up the corresponding Offset -#define COPY_STRIPE1(symbol) \ - data[stripe_size * ind.ind + static_cast(Offset::symbol)] = coords->symbol[ind]; +#define COPY_STRIPE1(symbol) \ + if (coords->symbol.isAllocated()) \ + data[stripe_size * ind.ind + static_cast(Offset::symbol)] = coords->symbol[ind]; // Implement copy for each argument #define COPY_STRIPE(...) \ @@ -54,10 +55,15 @@ CoordinatesAccessor::CoordinatesAccessor(const Coordinates* coords) { COPY_STRIPE(d1_dx, d1_dy, d1_dz); COPY_STRIPE(J); - data[stripe_size * ind.ind + static_cast(Offset::B)] = coords->Bxy[ind]; - data[stripe_size * ind.ind + static_cast(Offset::Byup)] = coords->Bxy.yup()[ind]; - data[stripe_size * ind.ind + static_cast(Offset::Bydown)] = - coords->Bxy.ydown()[ind]; + if (coords->Bxy.isAllocated()) { + data[stripe_size * ind.ind + static_cast(Offset::B)] = coords->Bxy[ind]; + if (coords->Bxy.yup().isAllocated()) + data[stripe_size * ind.ind + static_cast(Offset::Byup)] = + coords->Bxy.yup()[ind]; + if (coords->Bxy.ydown().isAllocated()) + data[stripe_size * ind.ind + static_cast(Offset::Bydown)] = + coords->Bxy.ydown()[ind]; + } COPY_STRIPE(G1, G3); COPY_STRIPE(g11, g12, g13, g22, g23, g33); diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 382052047d..40085eaf92 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -17,6 +17,11 @@ #include +#if BOUT_HAS_CUDA +#include +#include +#endif + ShiftedMetric::ShiftedMetric(Mesh& m, CELL_LOC location_in, Field2D zShift_, BoutReal zlength_in, Options* opt) : ParallelTransform(m, opt), location(location_in), zShift(std::move(zShift_)), @@ -38,8 +43,8 @@ void ShiftedMetric::checkInputGrid() { "Should be 'shiftedmetric'."); } } // else: parallel_transform variable not found in grid input, indicates older input - // file or grid from options so must rely on the user having ensured the type is - // correct + // file or grid from options so must rely on the user having ensured the type is + // correct } void ShiftedMetric::outputVars(Options& output_options) { @@ -222,6 +227,260 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou irfft(&cmplx[0], mesh.LocalNz, out); // Reverse FFT } +#if BOUT_HAS_CUDA +// Bit-reversal +__device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { + unsigned int result = 0; +#pragma unroll + for (unsigned int i = 0; i < log2n; i++) { + result = (result << 1) | (x & 1); + x >>= 1; + } + return result; +} + +// Block-level cooperative FFT +// Multiple threads cooperate on each FFT using shared memory +template +__global__ void fft_block_cooperative(const BoutReal** __restrict__ in, + BoutReal** __restrict__ out, + const double2** __restrict__ blocks_phs, + const int nbatches, const int nblocks) { + + constexpr int LOG2_NZ = __builtin_ctz(NZ); + constexpr double INV_NZ = 1.0 / (double)NZ; + constexpr int NMODES = (NZ / 2) + 1; + + // Shared memory for FFTS_PER_BLOCK FFTs + // Each FFT needs NZ complex values + __shared__ double2 shared_fft[FFTS_PER_BLOCK][NZ]; + + // Select twiddles based on size + const double2* twiddles; + if constexpr (NZ == 16) { + twiddles = c_twiddle_16; + } else if constexpr (NZ == 64) { + twiddles = c_twiddle_64; + } else if constexpr (NZ == 128) { + twiddles = c_twiddle_128; + } else if constexpr (NZ == 256) { + twiddles = c_twiddle_256; + } else if constexpr (NZ == 512) { + twiddles = c_twiddle_512; + } else { + static_assert(NZ == 16 || NZ == 64 || NZ == 128 || NZ == 256 || NZ == 512, + "Unsupported NZ"); + } + + // Each block processes FFTS_PER_BLOCK FFTs + const int fft_id_in_block = + threadIdx.y; // Which FFT this thread works on (0 to FFTS_PER_BLOCK-1) + const int global_fft_id = blockIdx.x * FFTS_PER_BLOCK + fft_id_in_block; + + if (global_fft_id >= nblocks * nbatches) + return; + + const int block = global_fft_id / nbatches; + const int batch = global_fft_id % nbatches; + + const double* __restrict__ in_line = in[block] + batch * NZ; + double* __restrict__ out_line = out[block] + batch * NZ; + const double2* __restrict__ phs = blocks_phs[block]; + + // Thread ID within the FFT computation + const int tid = threadIdx.x; + const int threads_per_fft = blockDim.x; // All threads in x-dimension work on same FFT + + // ===== LOAD INPUT WITH BIT-REVERSAL ===== + // Each thread loads some elements (strided) + for (int i = tid; i < NZ; i += threads_per_fft) { + const unsigned int rev_i = bit_reverse(i, LOG2_NZ); + shared_fft[fft_id_in_block][rev_i].x = in_line[i]; + shared_fft[fft_id_in_block][rev_i].y = 0.0; + } + __syncthreads(); + + // ===== FORWARD FFT: Cooley-Tukey DIT in Shared Memory ===== + for (int stage = 0; stage < LOG2_NZ; ++stage) { + const int m = 1 << (stage + 1); + const int m_half = m >> 1; + + // Each thread processes multiple butterflies + for (int k = tid; k < NZ / 2; k += threads_per_fft) { + const int butterfly_group = k / m_half; + const int j = k % m_half; + const int idx_top = butterfly_group * m + j; + const int idx_bot = idx_top + m_half; + + // Twiddle factor + const int twiddle_k = (j * NZ) / m; + const double wr = twiddles[twiddle_k].x; + const double wi = twiddles[twiddle_k].y; + + // Load from shared memory + const double top_r = shared_fft[fft_id_in_block][idx_top].x; + const double top_i = shared_fft[fft_id_in_block][idx_top].y; + const double bot_r = shared_fft[fft_id_in_block][idx_bot].x; + const double bot_i = shared_fft[fft_id_in_block][idx_bot].y; + + // Butterfly: t = W * bottom + const double t_r = wr * bot_r - wi * bot_i; + const double t_i = wr * bot_i + wi * bot_r; + + // Write back + shared_fft[fft_id_in_block][idx_top].x = top_r + t_r; + shared_fft[fft_id_in_block][idx_top].y = top_i + t_i; + shared_fft[fft_id_in_block][idx_bot].x = top_r - t_r; + shared_fft[fft_id_in_block][idx_bot].y = top_i - t_i; + } + __syncthreads(); + } + + // ===== APPLY PHASE SHIFT ===== + for (int k = tid; k < NMODES; k += threads_per_fft) { + const double2 ph = phs[batch * NMODES + k]; + const double real = shared_fft[fft_id_in_block][k].x; + const double imag = shared_fft[fft_id_in_block][k].y; + shared_fft[fft_id_in_block][k].x = real * ph.x - imag * ph.y; + shared_fft[fft_id_in_block][k].y = real * ph.y + imag * ph.x; + } + + for (int k = tid + NMODES; k < NZ; k += threads_per_fft) { + const int kk = NZ - k; + const double2 tmp = phs[batch * NMODES + kk]; + const double real = shared_fft[fft_id_in_block][k].x; + const double imag = shared_fft[fft_id_in_block][k].y; + shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; + shared_fft[fft_id_in_block][k].y = -real * tmp.y + imag * tmp.x; + } + __syncthreads(); + + // ===== INVERSE FFT: Conjugate, FFT, Conjugate ===== + // Conjugate input + for (int i = tid; i < NZ; i += threads_per_fft) { + shared_fft[fft_id_in_block][i].y = -shared_fft[fft_id_in_block][i].y; + } + __syncthreads(); + + // Bit-reverse with standard swap to avoid temp array + // This is tricky but saves memory + for (int i = tid; i < NZ / 2; i += threads_per_fft) { + const unsigned int rev_i = bit_reverse(i, LOG2_NZ); + if (i < rev_i) { // Only swap once per pair + double2 temp = shared_fft[fft_id_in_block][i]; + shared_fft[fft_id_in_block][i] = shared_fft[fft_id_in_block][rev_i]; + shared_fft[fft_id_in_block][rev_i] = temp; + } + } + __syncthreads(); + + // Forward FFT again (for inverse) + for (int stage = 0; stage < LOG2_NZ; ++stage) { + const int m = 1 << (stage + 1); + const int m_half = m >> 1; + + for (int k = tid; k < NZ / 2; k += threads_per_fft) { + const int butterfly_group = k / m_half; + const int j = k % m_half; + const int idx_top = butterfly_group * m + j; + const int idx_bot = idx_top + m_half; + + const int twiddle_k = (j * NZ) / m; + const double wr = twiddles[twiddle_k].x; + const double wi = twiddles[twiddle_k].y; + + const double top_r = shared_fft[fft_id_in_block][idx_top].x; + const double top_i = shared_fft[fft_id_in_block][idx_top].y; + const double bot_r = shared_fft[fft_id_in_block][idx_bot].x; + const double bot_i = shared_fft[fft_id_in_block][idx_bot].y; + + const double t_r = wr * bot_r - wi * bot_i; + const double t_i = wr * bot_i + wi * bot_r; + + shared_fft[fft_id_in_block][idx_top].x = top_r + t_r; + shared_fft[fft_id_in_block][idx_top].y = top_i + t_i; + shared_fft[fft_id_in_block][idx_bot].x = top_r - t_r; + shared_fft[fft_id_in_block][idx_bot].y = top_i - t_i; + } + __syncthreads(); + } + + // Store output (conjugate and normalize) + for (int i = tid; i < NZ; i += threads_per_fft) { + out_line[i] = shared_fft[fft_id_in_block][i].x * INV_NZ; + } +} + +// Launcher for block-level cooperative FFT +static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, + const double2** phs, int nblocks, int nbatches, + cudaStream_t stream = 0) { + if ((Nz & (Nz - 1)) != 0) { + fprintf(stderr, "Error: Nz=%d must be power of 2\n", Nz); + return; + } + + const int total_ffts = nblocks * nbatches; + + if (Nz == 16) { + constexpr int FFTS_PER_BLOCK = 16; + constexpr int THREADS_PER_FFT = 16; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<16, FFTS_PER_BLOCK> + <<>>(in, out, phs, nbatches, nblocks); + } else if (Nz == 64) { + constexpr int FFTS_PER_BLOCK = 4; + constexpr int THREADS_PER_FFT = 64; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<64, FFTS_PER_BLOCK> + <<>>(in, out, phs, nbatches, nblocks); + + } else if (Nz == 128) { + constexpr int FFTS_PER_BLOCK = 2; + constexpr int THREADS_PER_FFT = 128; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<128, FFTS_PER_BLOCK> + <<>>(in, out, phs, nbatches, nblocks); + + } else if (Nz == 256) { + constexpr int FFTS_PER_BLOCK = 1; + constexpr int THREADS_PER_FFT = 256; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); + dim3 grid(total_ffts); + + fft_block_cooperative<256, FFTS_PER_BLOCK> + <<>>(in, out, phs, nbatches, nblocks); + + } else if (Nz == 512) { + constexpr int FFTS_PER_BLOCK = 1; + constexpr int THREADS_PER_FFT = 512; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); + dim3 grid(total_ffts); + + fft_block_cooperative<512, FFTS_PER_BLOCK> + <<>>(in, out, phs, nbatches, nblocks); + } else { + throw std::runtime_error("Unsupported Nz " + std::to_string(Nz) + " for block FFT"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error(std::string("Block FFT failed: ") + cudaGetErrorString(err)); + } +} +#endif + void ShiftedMetric::calcParallelSlices(Field3D& f) { if (f.getDirectionY() == YDirectionType::Aligned) { // Cannot calculate parallel slices for field-aligned fields, so return without @@ -231,9 +490,76 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { f.splitParallelSlices(); +#if BOUT_HAS_CUDA + auto& region = mesh.getRegion2D("RGN_NOY"); + static size_t nblocks = region.getBlocks().size(); + if (nblocks != region.getBlocks().size()) { + throw BoutException("Number of blocks changed in ShiftedMetric::calcParallelSlices"); + } + + static struct StreamRAII { + cudaStream_t stream = 0; + StreamRAII() { + if (cudaStreamCreate(&stream) != cudaSuccess) { + throw BoutException("Failed to create CUDA stream"); + } + } + + cudaStream_t get() const { return stream; } + + void synchronize() const { cudaStreamSynchronize(stream); } + + ~StreamRAII() { cudaStreamDestroy(stream); } + } stream; + + // Vector of Arrays for each phase. + std::vector> blocks_in_phase; + std::vector> blocks_out_phase; + std::vector> phs_in_phase; + for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset); f_slice.allocate(); + + size_t block_idx = 0; + int nbatches = + region.getBlocks().cbegin()->second.ind - region.getBlocks().cbegin()->first.ind; + + Array& blocks_in = blocks_in_phase.emplace_back(nblocks); + Array& blocks_out = blocks_out_phase.emplace_back(nblocks); + Array& phs_in = phs_in_phase.emplace_back(nblocks); + + for (auto block = region.getBlocks().cbegin(), end = region.getBlocks().cend(); + block < end; ++block) { + auto idx_s = block->first; + auto idx_e = block->second; + int inner_nbatches = idx_e.ind - idx_s.ind; + if (inner_nbatches != nbatches) { + throw BoutException( + "Non-uniform number of batches in ShiftedMetric::calcParallelSlices"); + } + const int ix = idx_s.x(); + const int iy = idx_s.y(); + const int iy_offset = iy + phase.y_offset; + + blocks_in[block_idx] = &f(ix, iy_offset, 0); + blocks_out[block_idx] = &f_slice(ix, iy_offset, 0); + phs_in[block_idx] = reinterpret_cast(&phase.phase_shift(ix, iy, 0)); + + block_idx++; + } + + shiftZ_block_fft(mesh.LocalNz, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, + nbatches, stream.get()); + } + + // Synchronize to ensure all shifts are complete. + stream.synchronize(); +#else + for (const auto& phase : parallel_slice_phases) { + auto& f_slice = f.ynext(phase.y_offset); + f_slice.allocate(); + BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { const int ix = i.x(); const int iy = i.y(); @@ -242,6 +568,7 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { &(f_slice(ix, iy_offset, 0))); } } +#endif } std::vector diff --git a/src/sys/derivs.cxx b/src/sys/derivs.cxx index ee9bcbcc2c..f12c517f82 100644 --- a/src/sys/derivs.cxx +++ b/src/sys/derivs.cxx @@ -173,7 +173,7 @@ Coordinates::FieldMetric D2DX2(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { Coordinates* coords = f.getCoordinates(outloc); - auto result = + Field2D result = bout::derivatives::index::D2DX2(f, outloc, method, region) / SQ(coords->dx); if (coords->non_uniform) { @@ -210,7 +210,7 @@ Coordinates::FieldMetric D2DY2(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { Coordinates* coords = f.getCoordinates(outloc); - auto result = + Field2D result = bout::derivatives::index::D2DY2(f, outloc, method, region) / SQ(coords->dy); if (coords->non_uniform) { // Correction for non-uniform f.getMesh()