From dc932cc56bdefce5f81dfa31fecf08fedcd0c4c6 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 25 Feb 2025 13:57:25 -0800 Subject: [PATCH 01/29] Support BOUT_FOR_RAJA GPU field operators --- CMakeLists.txt | 9 +- include/bout/coordinates_accessor.hxx | 2 +- include/bout/field_accessor.hxx | 40 +++++- include/bout/mesh.hxx | 5 + include/bout/rajalib.hxx | 13 +- include/bout/single_index_ops.hxx | 11 -- src/field/gen_fieldops.jinja | 200 +++++++++++++++++++++----- src/field/gen_fieldops.py | 58 +++++++- src/mesh/coordinates.cxx | 5 +- src/mesh/coordinates_accessor.cxx | 18 ++- 10 files changed, 294 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c45fca3b72..f0a657fe94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -386,8 +386,15 @@ if (BOUT_GENERATE_FIELDOPS) if (NOT ClangFormat_FOUND) message(FATAL_ERROR "clang-format not found, but you have requested to generate code!") endif() + if (BOUT_ENABLE_RAJA) + set(GEN_LOOP_EXEC "raja") + elseif (BOUT_ENABLE_OPENMP) + set(GEN_LOOP_EXEC "openmp") + else() + set(GEN_LOOP_EXEC "serial") + endif() add_custom_command( OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/src/field/generated_fieldops.cxx - COMMAND ${Python3_EXECUTABLE} gen_fieldops.py --filename generated_fieldops.cxx.tmp + COMMAND ${Python3_EXECUTABLE} gen_fieldops.py --loop-exec ${GEN_LOOP_EXEC} --filename generated_fieldops.cxx.tmp COMMAND ${ClangFormat_BIN} generated_fieldops.cxx.tmp -i COMMAND ${CMAKE_COMMAND} -E rename generated_fieldops.cxx.tmp generated_fieldops.cxx DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/field/gen_fieldops.jinja ${CMAKE_CURRENT_SOURCE_DIR}/src/field/gen_fieldops.py diff --git a/include/bout/coordinates_accessor.hxx b/include/bout/coordinates_accessor.hxx index 532351d57a..2376ab5039 100644 --- a/include/bout/coordinates_accessor.hxx +++ b/include/bout/coordinates_accessor.hxx @@ -31,7 +31,7 @@ /// -> If Coordinates data is changed, the cache should be cleared /// by calling CoordinatesAccessor::clear() 
struct CoordinatesAccessor { - CoordinatesAccessor() = delete; + CoordinatesAccessor() {} /// Constructor from Coordinates /// Copies data from coords, doesn't modify it diff --git a/include/bout/field_accessor.hxx b/include/bout/field_accessor.hxx index 69b58da979..71d0537d9e 100644 --- a/include/bout/field_accessor.hxx +++ b/include/bout/field_accessor.hxx @@ -57,10 +57,17 @@ struct FieldAccessor { /// Constructor from Field3D /// /// @param[in] f The field to access. Must already be allocated - explicit FieldAccessor(FieldType& f) : coords(f.getCoordinates()) { + explicit FieldAccessor(FieldType& f) { ASSERT0(f.getLocation() == location); ASSERT0(f.isAllocated()); + if (auto* Coords = f.getCoordinates()) { + coords = CoordinatesAccessor{Coords}; + } + else { + coords = CoordinatesAccessor{}; + } + data = BoutRealArray{&f(0, 0, 0)}; // Field size @@ -81,15 +88,19 @@ struct FieldAccessor { ddt = BoutRealArray{&(f.timeDeriv()->operator()(0, 0, 0))}; } + explicit FieldAccessor(const FieldType& f) : FieldAccessor(const_cast(f)) {} + /// Provide shorthand for access to field data. /// Does not convert between 3D and 2D indices, /// so fa[i] is equivalent to fa.data[i]. 
/// BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } BOUT_HOST_DEVICE inline const BoutReal& operator[](const Ind3D& ind) const { return data[ind.ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](const Ind3D& ind) { return data[ind.ind]; } // Pointers to the field data arrays // These are wrapped in BoutRealArray types so they can be indexed with Ind3D or int @@ -115,6 +126,9 @@ struct FieldAccessor { template using Field2DAccessor = FieldAccessor; +template +using Field3DAccessor = FieldAccessor; + /// Syntactic sugar for time derivative of a field /// /// Usage: @@ -130,4 +144,28 @@ BOUT_HOST_DEVICE inline BoutRealArray& ddt(const FieldAccessor(fa.ddt); } +struct FieldPerpAccessor { + FieldPerpAccessor() = delete; + + int nx, nz; + int yindex; + BoutReal* data; + + explicit FieldPerpAccessor(const FieldPerp& f) { + ASSERT0(f.isAllocated()); + + data = BoutRealArray{const_cast(&f(0, 0, 0))}; + + // Field size + nx = f.getNx(); + nz = f.getNz(); + + yindex = f.getIndex(); + } + + BOUT_HOST_DEVICE int getIndex() const { return yindex; } + BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } + BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } +}; + #endif diff --git a/include/bout/mesh.hxx b/include/bout/mesh.hxx index a1c88a2634..b6553d06ec 100644 --- a/include/bout/mesh.hxx +++ b/include/bout/mesh.hxx @@ -762,6 +762,11 @@ public: return {(indPerp.ind - jz) * LocalNy + LocalNz * jy + jz, LocalNy, LocalNz}; } + BOUT_HOST_DEVICE int flatIndPerpto3D(const int& flatIndPerp, const int nz, int jy = 0) const { + int jz = flatIndPerp % nz; + return (flatIndPerp - jz) * LocalNy + LocalNz * jy + jz; + } + /// Converts an Ind3D to an Ind2D representing a 2D index using a lookup -- to be used with care Ind2D map3Dto2D(const Ind3D& ind3D) { return {indexLookup3Dto2D[ind3D.ind], LocalNy, 1}; diff 
--git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index b9f6913459..92eae68858 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -23,6 +23,15 @@ #include "RAJA/RAJA.hpp" // using RAJA lib +#if BOUT_HAS_CUDA +// TODO: Make configurable +const int CUDA_BLOCK_SIZE = 256; +using EXEC_POL = RAJA::cuda_exec; +//using EXEC_POL = RAJA::loop_exec; +#else // not BOUT_USE_CUDA +using EXEC_POL = RAJA::loop_exec; +#endif // end BOUT_USE_CUDA + /// Wrapper around RAJA::forall /// Enables computations to be done on CPU or GPU (CUDA). /// @@ -81,7 +90,7 @@ struct RajaForAll { // Note: must be a local variable const int* _ob_i_ind_raw = &_ob_i_ind[0]; RAJA::forall(RAJA::RangeSegment(0, _ob_i_ind.size()), - [=] RAJA_DEVICE(int id) { + [=] RAJA_DEVICE(int id) mutable { // Look up index and call user function f(_ob_i_ind_raw[id]); }); @@ -127,7 +136,7 @@ private: /// to create variables which shadow the class members. /// #define BOUT_FOR_RAJA(index, region, ...) \ - RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) +RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable #else // BOUT_HAS_RAJA diff --git a/include/bout/single_index_ops.hxx b/include/bout/single_index_ops.hxx index 60bd78bc36..c29d1a471f 100644 --- a/include/bout/single_index_ops.hxx +++ b/include/bout/single_index_ops.hxx @@ -7,17 +7,6 @@ #include "field_accessor.hxx" -#if BOUT_HAS_RAJA -//-- RAJA CUDA settings--------------------------------------------------------start -#if BOUT_HAS_CUDA -const int CUDA_BLOCK_SIZE = 256; // TODO: Make configurable -using EXEC_POL = RAJA::cuda_exec; -#else // not BOUT_USE_CUDA -using EXEC_POL = RAJA::loop_exec; -#endif // end BOUT_USE_CUDA -////-----------CUDA settings------------------------------------------------------end -#endif // end BOUT_HAS_RAJA - // Ind3D: i.zp(): BOUT_HOST_DEVICE inline int i_zp(const int id, const int nz) { int jz = id % nz; diff --git a/src/field/gen_fieldops.jinja 
b/src/field/gen_fieldops.jinja index ecd4e628cc..60f9cbbd7e 100644 --- a/src/field/gen_fieldops.jinja +++ b/src/field/gen_fieldops.jinja @@ -8,6 +8,26 @@ checkData({{lhs.name}}); checkData({{rhs.name}}); + {% if (region_loop == "BOUT_FOR_RAJA") %} + {% if out.field_type == "FieldPerp" %} + auto {{out.name}}_acc = FieldPerpAccessor{ {{out.name}} }; + {% else %} + auto {{out.name}}_acc = FieldAccessor({{out.name}}); + {% endif %} + {% if lhs.field_type == "FieldPerp" %} + auto {{lhs.name}}_acc = FieldPerpAccessor{ {{lhs.name}} }; + {% elif lhs.field_type == "BoutReal" %} + {% else %} + auto {{lhs.name}}_acc = FieldAccessor({{lhs.name}}); + {% endif %} + {% if rhs.field_type == "FieldPerp" %} + auto {{rhs.name}}_acc = FieldPerpAccessor{ {{rhs.name}} }; + {% elif rhs.field_type == "BoutReal" %} + {% else %} + auto {{rhs.name}}_acc = FieldAccessor({{rhs.name}}); + {% endif %} + {% endif %} + {% if out == "Field3D" %} {% if lhs == rhs == "Field3D" %} {{out.name}}.setRegion({{lhs.name}}.getMesh()->getCommonRegion({{lhs.name}}.getRegionID(), @@ -20,45 +40,98 @@ {% endif %} {% if (out == "Field3D") and ((lhs == "Field2D") or (rhs =="Field2D")) %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + int mesh_nz = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}_acc.mesh_nz; + {% else %} Mesh *localmesh = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}.getMesh(); + {% endif %} {% if (lhs == "Field2D") %} {{region_loop}}({{index_var}}, {{lhs.name}}.getRegion({{region_name}})) { {% else %} {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { {% endif %} - const auto {{mixed_base_ind}} = localmesh->ind2Dto3D({{index_var}}); - {% if (operator == "/") and (rhs == "Field2D") %} - const auto tmp = 1.0 / {{rhs.mixed_index}}; - for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ - {{out.mixed_index}} = {{lhs.mixed_index}} * tmp; + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = {{index_var}} * mesh_nz; 
+ {% else %} + const auto {{mixed_base_ind}} = localmesh->ind2Dto3D({{index_var}}); + {% endif %} + {% if (operator == "/") and (rhs == "Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto tmp = 1.0 / {{rhs.mixed_index_acc}}; {% else %} - for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ - {{out.mixed_index}} = {{lhs.mixed_index}} {{operator}} {{rhs.mixed_index}}; + const auto tmp = 1.0 / {{rhs.mixed_index}}; {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.mixed_index_acc}} = {{lhs.mixed_index_acc}} * tmp; + {% else %} + {{out.mixed_index}} = {{lhs.mixed_index}} * tmp; + {% endif %} + {% else %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < localmesh->LocalNz; ++{{jz_var}}){ + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.mixed_index_acc}} = {{lhs.mixed_index_acc}} {{operator}} {{rhs.mixed_index_acc}}; + {% else %} + {{out.mixed_index}} = {{lhs.mixed_index}} {{operator}} {{rhs.mixed_index}}; + {% endif %} + {% endif %} } - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif out == "FieldPerp" and (lhs == "Field2D" or lhs == "Field3D" or rhs == "Field2D" or rhs == "Field3D")%} Mesh *localmesh = {{lhs.name if lhs.field_type != "BoutReal" else rhs.name}}.getMesh(); {{region_loop}}({{index_var}}, {{out.name}}.getRegion({{region_name}})) { - int yind = {{lhs.name if lhs == "FieldPerp" else rhs.name}}.getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = {{lhs.name if lhs == "FieldPerp" else rhs.name}}_acc.getIndex(); + {% else %} + int yind = {{lhs.name if lhs == 
"FieldPerp" else rhs.name}}.getIndex(); + {% endif %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + ; // DONE2 + const auto {{mixed_base_ind}} = localmesh->flatIndPerpto3D({{index_var}}, result_acc.nz, yind); + {% else %} + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + {% endif %} {% if lhs != "FieldPerp" %} - {{out.index}} = {{lhs.base_index}} {{operator}} {{rhs.index}}; + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.base_index_acc}} {{operator}} {{rhs.index_acc}}; + {% else %} + {{out.index}} = {{lhs.base_index}} {{operator}} {{rhs.index}}; + {% endif %} {% else %} - {{out.index}} = {{lhs.index}} {{operator}} {{rhs.base_index}}; + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} {{operator}} {{rhs.base_index_acc}}; + {% else %} + {{out.index}} = {{lhs.index}} {{operator}} {{rhs.base_index}}; + {% endif %} {% endif %} - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif (operator == "/") and (rhs == "BoutReal") %} const auto tmp = 1.0 / {{rhs.index}}; {{region_loop}}({{index_var}}, {{out.name}}.getValidRegionWithDefault({{region_name}})) { - {{out.index}} = {{lhs.index}} * tmp; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} * tmp; + {% else %} + {{out.index}} = {{lhs.index}} * tmp; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% else %} {{region_loop}}({{index_var}}, {{out.name}}.getValidRegionWithDefault({{region_name}})) { - {{out.index}} = {{lhs.index}} {{operator}} {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + {{out.index_acc}} = {{lhs.index_acc}} {{operator}} {{rhs.index_acc}}; + {% else %} + {{out.index}} = {{lhs.index}} {{operator}} {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% endif %} checkData({{out.name}}); @@ -84,49 +157,102 @@ checkData(*this); checkData({{rhs.name}}); + {% if (region_loop == "BOUT_FOR_RAJA") %} + {% if 
lhs.field_type == "FieldPerp" %} + auto this_acc = FieldPerpAccessor{(*this)}; + {% else %} + auto this_acc = FieldAccessor(*this); + {% endif %} + {% if rhs.field_type == "FieldPerp" %} + auto {{rhs.name}}_acc = FieldPerpAccessor{ {{rhs.name}} }; + {% elif rhs.field_type == "BoutReal" %} + {% else %} + auto {{rhs.name}}_acc = FieldAccessor({{rhs.name}}); + {% endif %} + {% endif %} + {% if lhs == rhs == "Field3D" %} regionID = fieldmesh->getCommonRegion(regionID, {{rhs.name}}.regionID); {% endif %} - {% if (lhs == "Field3D") and (rhs =="Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + int mesh_nz = fieldmesh->LocalNz; + {% endif %} {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { - const auto {{mixed_base_ind}} = fieldmesh->ind2Dto3D({{index_var}}); - {% if (operator == "/") and (rhs == "Field2D") %} - const auto tmp = 1.0 / {{rhs.mixed_index}}; - for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ - (*this)[{{mixed_base_ind}} + {{jz_var}}] *= tmp; + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = {{index_var}} * mesh_nz; + {% else %} + const auto {{mixed_base_ind}} = fieldmesh->ind2Dto3D({{index_var}}); + {% endif %} + {% if (operator == "/") and (rhs == "Field2D") %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto tmp = 1.0 / {{rhs.mixed_index_acc}}; + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + this_acc[{{mixed_base_ind}} + {{jz_var}}] *= tmp; {% else %} - for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ - (*this)[{{mixed_base_ind}} + {{jz_var}}] {{operator}}= {{rhs.index}}; + const auto tmp = 1.0 / {{rhs.mixed_index}}; + for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ + (*this)[{{mixed_base_ind}} + {{jz_var}}] *= tmp; {% endif %} + {% else %} + {% if (region_loop == "BOUT_FOR_RAJA") %} + for (int {{jz_var}} = 0; {{jz_var}} < mesh_nz; ++{{jz_var}}){ + this_acc[{{mixed_base_ind}} + {{jz_var}}] 
{{operator}}= {{rhs.index_acc}}; + {% else %} + for (int {{jz_var}} = 0; {{jz_var}} < fieldmesh->LocalNz; ++{{jz_var}}){ + (*this)[{{mixed_base_ind}} + {{jz_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + {% endif %} } - } + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif lhs == "FieldPerp" and (rhs == "Field3D" or rhs == "Field2D")%} Mesh *localmesh = this->getMesh(); + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = this->getIndex(); + {% endif %} {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { - int yind = this->getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); - (*this)[{{index_var}}] {{operator}}= {{rhs.base_index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + const auto {{mixed_base_ind}} = localmesh->flatIndPerpto3D({{index_var}}, yind); + this_acc[{{index_var}}] {{operator}}= {{rhs.base_index_acc}}; + {% else %} + int yind = this->getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + (*this)[{{index_var}}] {{operator}}= {{rhs.base_index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% elif rhs == "FieldPerp" and (lhs == "Field3D" or lhs == "Field2D")%} Mesh *localmesh = this->getMesh(); {{region_loop}}({{index_var}}, {{rhs.name}}.getRegion({{region_name}})) { - int yind = {{rhs.name}}.getIndex(); - const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); - (*this)[{{base_ind_var}}] {{operator}}= {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + int yind = {{rhs.name}}.getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + this_acc[{{base_ind_var}}] {{operator}}= {{rhs.index}}; + {% else %} + int yind = {{rhs.name}}.getIndex(); + const auto {{mixed_base_ind}} = localmesh->indPerpto3D({{index_var}}, yind); + (*this)[{{base_ind_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} 
{% elif (operator == "/") and (lhs == "Field3D" or lhs == "Field2D") and (rhs =="BoutReal") %} const auto tmp = 1.0 / {{rhs.index}}; {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { + {% if (region_loop == "BOUT_FOR_RAJA") %} + this_acc[{{index_var}}] *= tmp; + {% else %} (*this)[{{index_var}}] *= tmp; - } + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% else %} {{region_loop}}({{index_var}}, this->getRegion({{region_name}})) { - (*this)[{{index_var}}] {{operator}}= {{rhs.index}}; - } + {% if (region_loop == "BOUT_FOR_RAJA") %} + this_acc[{{index_var}}] {{operator}}= {{rhs.index_acc}}; + {% else %} + (*this)[{{index_var}}] {{operator}}= {{rhs.index}}; + {% endif %} + }{% if (region_loop == "BOUT_FOR_RAJA") %};{% endif %} {% endif %} checkData(*this); diff --git a/src/field/gen_fieldops.py b/src/field/gen_fieldops.py index 29631ff7aa..bf06a8ea5c 100755 --- a/src/field/gen_fieldops.py +++ b/src/field/gen_fieldops.py @@ -132,6 +132,17 @@ def index(self): else: return "{self.name}[{self.index_var}]".format(self=self) + @property + def index_acc(self): + """Returns "_acc[{index_var}]" for an accessor-based index, except if + field_type is BoutReal, in which case just returns "" + + """ + if self.field_type == "BoutReal": + return "{self.name}".format(self=self) + else: + return "{self.name}_acc[{self.index_var}]".format(self=self) + @property def mixed_index(self): """Returns "[{index_var} + {jz_var}]" if field_type is Field3D, @@ -147,6 +158,21 @@ def mixed_index(self): else: # Field2D return "{self.name}[{self.index_var}]".format(self=self) + @property + def mixed_index_acc(self): + """Returns "_acc[{index_var} + {jz_var}]" for an accessor if field_type + is Field3D, self.index if Field2D or just returns "" for BoutReal + + """ + if self.field_type == "BoutReal": + return "{self.name}_acc".format(self=self) + elif self.field_type == "Field3D": + return "{self.name}_acc[{self.mixed_base_ind_var} + {self.jz_var}]".format( + 
self=self + ) + else: # Field2D + return "{self.name}_acc[{self.index_var}]".format(self=self) + @property def base_index(self): """Returns "[{mixed_base_ind_var}]" if field_type is Field3D, Field2D or FieldPerp @@ -158,6 +184,18 @@ def base_index(self): else: return "{self.name}[{self.mixed_base_ind_var}]".format(self=self) + @property + def base_index_acc(self): + """Returns "_acc[{mixed_base_ind_var}]" for an accessor if field_type is + Field3D, Field2D or FieldPerp or just returns "" for BoutReal + + """ + if self.field_type == "BoutReal": + return "{self.name}".format(self=self) + else: + return "{self.name}_acc[{self.mixed_base_ind_var}]".format(self=self) + + def __eq__(self, other): try: return self.field_type == other.field_type @@ -198,11 +236,11 @@ def returnType(f1, f2): ) # By default use OpenMP enabled loops but allow to disable parser.add_argument( - "--no-openmp", - action="store_false", - default=False, - dest="noOpenMP", - help="Don't use OpenMP compatible loops", + "--loop-exec", + default="openmp", + dest="loop_exec", + choices=["serial", "openmp", "raja"], + help="Choose the loop execution method. 
Default is OpenMP", ) args = parser.parse_args() @@ -213,10 +251,16 @@ def returnType(f1, f2): mixed_base_ind_var = "base_ind" region_name = '"RGN_ALL"' - if args.noOpenMP: + if args.loop_exec == "openmp": + region_loop = "BOUT_FOR" + elif args.loop_exec == "raja": + region_loop = "BOUT_FOR_RAJA" + header += "#include \n" + header += "#include \n" + elif args.loop_exec == "serial": region_loop = "BOUT_FOR_SERIAL" else: - region_loop = "BOUT_FOR" + raise ValueError("Unknown loop execution method") # Declare what fields we currently support: # Field perp is currently missing diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 34c524d1e7..8123720144 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -23,6 +23,8 @@ #include "parallel/fci.hxx" #include "parallel/shiftedmetricinterp.hxx" +#include "bout/coordinates_accessor.hxx" + // use anonymous namespace so this utility function is not available outside this file namespace { template @@ -1203,10 +1205,11 @@ int Coordinates::geometry(bool recalculate_staggered, localmesh->recalculateStaggeredCoordinates(); } - // Invalidate and recalculate cached variables + // Invalidate and recalculate cached variables and any accessor zlength_cache.reset(); Grad2_par2_DDY_invSgCache.clear(); invSgCache.reset(); + CoordinatesAccessor::clear(this); return 0; } diff --git a/src/mesh/coordinates_accessor.cxx b/src/mesh/coordinates_accessor.cxx index aff546c2b0..196234d999 100644 --- a/src/mesh/coordinates_accessor.cxx +++ b/src/mesh/coordinates_accessor.cxx @@ -40,8 +40,9 @@ CoordinatesAccessor::CoordinatesAccessor(const Coordinates* coords) { // Copy data from Coordinates variable into data array // Uses the symbol to look up the corresponding Offset -#define COPY_STRIPE1(symbol) \ - data[stripe_size * ind.ind + static_cast(Offset::symbol)] = coords->symbol[ind]; +#define COPY_STRIPE1(symbol) \ + if (coords->symbol.isAllocated()) \ + data[stripe_size * ind.ind + static_cast(Offset::symbol)] = 
coords->symbol[ind]; // Implement copy for each argument #define COPY_STRIPE(...) \ @@ -54,10 +55,15 @@ CoordinatesAccessor::CoordinatesAccessor(const Coordinates* coords) { COPY_STRIPE(d1_dx, d1_dy, d1_dz); COPY_STRIPE(J); - data[stripe_size * ind.ind + static_cast(Offset::B)] = coords->Bxy[ind]; - data[stripe_size * ind.ind + static_cast(Offset::Byup)] = coords->Bxy.yup()[ind]; - data[stripe_size * ind.ind + static_cast(Offset::Bydown)] = - coords->Bxy.ydown()[ind]; + if (coords->Bxy.isAllocated()) { + data[stripe_size * ind.ind + static_cast(Offset::B)] = coords->Bxy[ind]; + if (coords->Bxy.yup().isAllocated()) + data[stripe_size * ind.ind + static_cast(Offset::Byup)] = + coords->Bxy.yup()[ind]; + if (coords->Bxy.ydown().isAllocated()) + data[stripe_size * ind.ind + static_cast(Offset::Bydown)] = + coords->Bxy.ydown()[ind]; + } COPY_STRIPE(G1, G3); COPY_STRIPE(g11, g12, g13, g22, g23, g33); From b83d2b5286ab5f56e4ca257bf86dc952f84daf57 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Mon, 26 May 2025 18:57:31 -0700 Subject: [PATCH 02/29] Working WIP --- include/bout/field3d.hxx | 5 +- include/bout/field_accessor.hxx | 1 + include/bout/fieldops.hxx | 67 +++++++++++++++ include/bout/rajalib.hxx | 15 ++++ include/bout/vector3d.hxx | 1 + src/field/field3d.cxx | 3 +- src/field/generated_fieldops.cxx | 86 +++++++++++++++++-- src/field/vecops.cxx | 1 + .../laplace/impls/naulin/naulin_laplace.cxx | 1 + src/sys/derivs.cxx | 1 + 10 files changed, 173 insertions(+), 8 deletions(-) create mode 100644 include/bout/fieldops.hxx diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index a75e38df36..6d0624ff73 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -38,6 +38,7 @@ class Field3D; #include class Mesh; +class BinaryExpr; /// Class for 3D X-Y-Z scalar fields /*! 
@@ -183,6 +184,7 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); + Field3D(const BinaryExpr& expr); /// Destructor ~Field3D() override; @@ -424,6 +426,7 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); + Field3D& operator=(BinaryExpr expr); ///@} /// Addition operators @@ -518,7 +521,7 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); -Field3D operator+(const Field3D& lhs, const Field3D& rhs); +BinaryExpr operator+(const Field3D& lhs, const Field3D& rhs); Field3D operator-(const Field3D& lhs, const Field3D& rhs); Field3D operator*(const Field3D& lhs, const Field3D& rhs); Field3D operator/(const Field3D& lhs, const Field3D& rhs); diff --git a/include/bout/field_accessor.hxx b/include/bout/field_accessor.hxx index 71d0537d9e..a43420d6b3 100644 --- a/include/bout/field_accessor.hxx +++ b/include/bout/field_accessor.hxx @@ -96,6 +96,7 @@ struct FieldAccessor { /// BOUT_HOST_DEVICE inline const BoutReal& operator[](int ind) const { return data[ind]; } BOUT_HOST_DEVICE inline BoutReal& operator[](int ind) { return data[ind]; } + __device__ inline BoutReal operator()(int i) const { return data[i]; } BOUT_HOST_DEVICE inline const BoutReal& operator[](const Ind3D& ind) const { return data[ind.ind]; diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx new file mode 100644 index 0000000000..b80aaec446 --- /dev/null +++ b/include/bout/fieldops.hxx @@ -0,0 +1,67 @@ +#pragma once +#ifndef BOUT_FIELDOPS_HXX +#define BOUT_FIELDOPS_HXX + +#include "bout/bout_types.hxx" +#include "bout/field_accessor.hxx" + +struct Add { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } +}; +struct Sub { + __device__ 
inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } +}; +struct Mul { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } +}; +struct Div { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } +}; + +struct BinaryExpr { + struct RegionIndices { + int* data; + int size; + + RegionIndices(int n) : size(n) { + cudaMallocManaged(&data, n * sizeof(int)); + for (int i = 0; i < n; ++i) + data[i] = 0; + } + ~RegionIndices() { cudaFree(data); } + + __device__ inline int operator()(int idx) const { return data[idx]; } + }; + + using FieldType = FieldAccessor; + + FieldType lhs; + FieldType rhs; + RegionIndices indices; + Add op; + + Mesh* mesh; + CELL_LOC location = CELL_CENTRE; + DirectionTypes directions; + + template + BinaryExpr(FieldType lhs, FieldType rhs, Mesh* mesh, CELL_LOC location, + DirectionTypes directions, const Region& region) + : lhs(lhs), rhs(rhs), mesh(mesh), location(location), directions(directions), + indices(region.getIndices().size()) { + // Copy the region indices into the managed array + for (int i = 0; i < indices.size; ++i) { + indices.data[i] = region.getIndices()[i].ind; + } + } + + __host__ __device__ inline int getSize() const { return indices.size; } + __device__ inline int regionIdx(int idx) const { return indices(idx); } + __device__ inline BoutReal operator()(int idx) const { return op(lhs(idx), rhs(idx)); } + + Mesh* getMesh() const { return mesh; } + CELL_LOC getLocation() const { return location; } + DirectionTypes getDirections() const { return directions; } +}; + +#endif // BOUT_EXPRESSION_HXX \ No newline at end of file diff --git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index 92eae68858..b3da46da50 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -14,6 +14,7 @@ */ #pragma once +#include "bout/array.hxx" #ifndef RAJALIB_H #define RAJALIB_H @@ -138,6 +139,20 @@ private: #define BOUT_FOR_RAJA(index, region, 
...) \ RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable +// NEW STUFF + +template +__global__ void evaluator(BoutReal *out, Expr &expr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + for (int i = tid; i < expr.getSize(); i += stride) { + out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion + } +} + +// END OF NEW STUFF + + #else // BOUT_HAS_RAJA #warning RAJA not enabled. BOUT_FOR_RAJA falling back to BOUT_FOR. diff --git a/include/bout/vector3d.hxx b/include/bout/vector3d.hxx index 0c71dcffa5..ad68dc17ee 100644 --- a/include/bout/vector3d.hxx +++ b/include/bout/vector3d.hxx @@ -36,6 +36,7 @@ class Vector3D; class Field2D; class Vector2D; #include "bout/field3d.hxx" +#include "bout/fieldops.hxx" /*! * Represents a 3D vector, with x,y,z components diff --git a/src/field/field3d.cxx b/src/field/field3d.cxx index 0d2bc0694e..9ea488d8f1 100644 --- a/src/field/field3d.cxx +++ b/src/field/field3d.cxx @@ -805,7 +805,8 @@ bool operator==(const Field3D& a, const Field3D& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return min(abs(a - b)) < 1e-10; + Field3D Sub = a - b; + return min(Sub) < 1e-10; } std::ostream& operator<<(std::ostream& out, const Field3D& value) { diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 6b778acee3..b1b99caaa9 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -1,11 +1,73 @@ // This file is autogenerated - see gen_fieldops.py +#include "bout/rajalib.hxx" +#include "bout/fieldops.hxx" + #include #include +#include #include #include #include #include +template +struct ExprFor { + using type = T; +}; + +template <> +struct ExprFor { + using type = FieldAccessor; +}; + +template +using ExprFor_t = typename ExprFor>::type; + +//template +//class ExpressionExpr : public ExprBase { +//private: +// const Expression& lhs; +// const Expression& rhs; +// Op op; +// +//public: 
+// ExpressionExpr(const Expression& lhs, const Expression& rhs, Op op) +// : lhs(lhs), rhs(rhs), op(op), +// ExprBase(lhs.getMesh(), lhs.getLocation(), lhs.getDirections()) {} +// +// __device__ BoutReal operator()(int idx) const override { +// return op(lhs(idx), rhs(idx)); +// } +// +// __host__ __device__ int getSize() const override { +// return lhs.getSize(); // Assume same size +// } +// +// __device__ int regionIdx(int idx) const override { +// return lhs.regionIdx(idx); // Use lhs indexing +// } +//}; + +Field3D& Field3D::operator=(BinaryExpr expr) { + constexpr int THREADS = 256; + int blocks = (size() + THREADS - 1) / THREADS; + + // one kernel launch that writes each element exactly once + evaluator<<>>(&data[0], expr); + cudaDeviceSynchronize(); + return *this; +} +// +Field3D::Field3D(const BinaryExpr& expr) { + Array data{expr.getSize()}; + + constexpr int THREADS = 256; + int blocks = (expr.getSize() + THREADS - 1) / THREADS; + evaluator<<>>(&data[0], expr); + cudaDeviceSynchronize(); + *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; +} + // Provide the C++ wrapper for multiplication of Field3D and Field3D Field3D operator*(const Field3D& lhs, const Field3D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -21,6 +83,8 @@ Field3D operator*(const Field3D& lhs, const Field3D& rhs) { } checkData(result); + std::cout << "operator*\n"; + getchar(); return result; } @@ -65,6 +129,8 @@ Field3D operator/(const Field3D& lhs, const Field3D& rhs) { } checkData(result); + std::cout << "operator/\n"; + getchar(); return result; } @@ -95,7 +161,7 @@ Field3D& Field3D::operator/=(const Field3D& rhs) { } // Provide the C++ wrapper for addition of Field3D and Field3D -Field3D operator+(const Field3D& lhs, const Field3D& rhs) { +BinaryExpr operator+(const Field3D& lhs, const Field3D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -104,12 +170,18 @@ Field3D operator+(const Field3D& lhs, const Field3D& rhs) { 
result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs[index]; - } + std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast>(lhs), + static_cast>(rhs), + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + result.getValidRegionWithDefault("RGN_ALL")}; - checkData(result); - return result; + //constexpr int THREADS = 256; + //int blocks = (BE.getSize() + THREADS - 1) / THREADS; + //evaluator<<>>(&result(0, 0, 0), BE); + //return result; } // Provide the C++ operator to update Field3D by addition with Field3D @@ -152,6 +224,8 @@ Field3D operator-(const Field3D& lhs, const Field3D& rhs) { result[index] = lhs[index] - rhs[index]; } + std::cout << "operator-\n"; + getchar(); checkData(result); return result; } diff --git a/src/field/vecops.cxx b/src/field/vecops.cxx index 5f34e2af02..9b1105e7aa 100644 --- a/src/field/vecops.cxx +++ b/src/field/vecops.cxx @@ -28,6 +28,7 @@ #include #include +#include #include #include #include diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx index e6f68d850d..74ec68dae9 100644 --- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx +++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx @@ -142,6 +142,7 @@ #include #include #include +#include #include #include #include diff --git a/src/sys/derivs.cxx b/src/sys/derivs.cxx index ee9bcbcc2c..55e2c77a29 100644 --- a/src/sys/derivs.cxx +++ b/src/sys/derivs.cxx @@ -49,6 +49,7 @@ #include +#include #include #include From 970870958f390587c1c1dff9ae3ee1493aa74c8a Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 27 May 2025 17:12:14 -0700 Subject: [PATCH 03/29] WIP 2 - Compiles but crashes. 
I suspect it's because nested operators copy Field3D on the device (rhs) --- include/bout/array.hxx | 2 +- include/bout/field3d.hxx | 52 +++++++++++++++++++-- include/bout/fieldops.hxx | 79 ++++++++++++++++++++++++++----- src/field/generated_fieldops.cxx | 80 +++++++++----------------------- 4 files changed, 140 insertions(+), 73 deletions(-) diff --git a/include/bout/array.hxx b/include/bout/array.hxx index 2c42f15aad..b83c29c51d 100644 --- a/include/bout/array.hxx +++ b/include/bout/array.hxx @@ -66,7 +66,7 @@ struct ArrayData { #if BOUT_HAS_UMPIRE auto& rm = umpire::ResourceManager::getInstance(); #if BOUT_HAS_CUDA - auto allocator = rm.getAllocator(umpire::resource::Pinned); + auto allocator = rm.getAllocator(umpire::resource::Unified); #else auto allocator = rm.getAllocator("HOST"); #endif diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 6d0624ff73..b03730ddfa 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -38,7 +38,10 @@ class Field3D; #include class Mesh; -class BinaryExpr; + +//template +//class BinaryExpr; +#include "bout/fieldops.hxx" /// Class for 3D X-Y-Z scalar fields /*! 
@@ -184,7 +187,15 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); - Field3D(const BinaryExpr& expr); + template + Field3D(const BinaryExpr& expr) { + Array data{expr.getSize()}; + constexpr int THREADS = 256; + int blocks = (expr.getSize() + THREADS - 1) / THREADS; + evaluatorExpr<<>>(&data[0], expr); + cudaDeviceSynchronize(); + *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; + } /// Destructor ~Field3D() override; @@ -415,6 +426,15 @@ public: return &data[(jx * ny + jy) * nz]; } + struct View { + BoutReal* data; + __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + }; + operator View() { return View{&data[0]}; } + + __device__ inline BoutReal operator()(int i) { return View()(i); } + __device__ inline BoutReal operator()(int i) const { return View()(i); } + ///////////////////////////////////////////////////////// // Operators @@ -426,7 +446,15 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); - Field3D& operator=(BinaryExpr expr); + template + Field3D& operator=(BinaryExpr expr) { + constexpr int THREADS = 256; + int blocks = (expr.getSize() + THREADS - 1) / THREADS; + evaluatorExpr<<>>(&data[0], expr); + cudaDeviceSynchronize(); + return *this; + } + ///@} /// Addition operators @@ -521,7 +549,23 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); -BinaryExpr operator+(const Field3D& lhs, const Field3D& rhs); +template && is_expr_v>> +BinaryExpr operator+(const L& lhs, const R& rhs) { + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + + std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{lhs, + rhs, + 
BinaryExpr::Op::ADD, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + Field3D operator-(const Field3D& lhs, const Field3D& rhs); Field3D operator*(const Field3D& lhs, const Field3D& rhs); Field3D operator/(const Field3D& lhs, const Field3D& rhs); diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index b80aaec446..7cdb339854 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -3,7 +3,14 @@ #define BOUT_FIELDOPS_HXX #include "bout/bout_types.hxx" -#include "bout/field_accessor.hxx" + +#include +#include + +class Mesh; +class Field3D; + +#include struct Add { __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } @@ -18,7 +25,18 @@ struct Div { __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } }; +template +__global__ static void evaluatorExpr(BoutReal* out, Expr& expr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + for (int i = tid; i < expr.getSize(); i += stride) { + out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion + } +} + +template struct BinaryExpr { + enum class Op { ADD, SUB, MUL, DIV }; struct RegionIndices { int* data; int size; @@ -33,22 +51,22 @@ struct BinaryExpr { __device__ inline int operator()(int idx) const { return data[idx]; } }; - using FieldType = FieldAccessor; - - FieldType lhs; - FieldType rhs; + L lhs; + R rhs; RegionIndices indices; - Add op; + Op op; Mesh* mesh; CELL_LOC location = CELL_CENTRE; DirectionTypes directions; + std::optional regionID; template - BinaryExpr(FieldType lhs, FieldType rhs, Mesh* mesh, CELL_LOC location, - DirectionTypes directions, const Region& region) - : lhs(lhs), rhs(rhs), mesh(mesh), location(location), directions(directions), - indices(region.getIndices().size()) { + BinaryExpr(L lhs, R rhs, Op op, 
Mesh* mesh, CELL_LOC location, + DirectionTypes directions, std::optional regionID, + const Region& region) + : lhs(lhs), rhs(rhs), op(op), mesh(mesh), location(location), + directions(directions), regionID(regionID), indices(region.getIndices().size()) { // Copy the region indices into the managed array for (int i = 0; i < indices.size; ++i) { indices.data[i] = region.getIndices()[i].ind; @@ -57,11 +75,50 @@ struct BinaryExpr { __host__ __device__ inline int getSize() const { return indices.size; } __device__ inline int regionIdx(int idx) const { return indices(idx); } - __device__ inline BoutReal operator()(int idx) const { return op(lhs(idx), rhs(idx)); } + __device__ inline BoutReal operator()(int idx) const { + switch (op) { + case Op::ADD: + return Add{}(lhs(idx), rhs(idx)); + case Op::SUB: + return Sub{}(lhs(idx), rhs(idx)); + case Op::MUL: + return Mul{}(lhs(idx), rhs(idx)); + case Op::DIV: + return Div{}(lhs(idx), rhs(idx)); + } + } + + void evaluate(BoutReal* data) const {} Mesh* getMesh() const { return mesh; } CELL_LOC getLocation() const { return location; } DirectionTypes getDirections() const { return directions; } + std::optional getRegionID() const { return regionID; }; }; +//template +//struct Expr { +// using type = T; +//}; +// +//template <> +//struct Expr { +// using type = Field3D::View; +//}; + +// 1) detect our BinaryExpr template +template +struct is_binary_expr : std::false_type {}; +template +struct is_binary_expr> : std::true_type {}; + +// 2) detect “any subclass of Field” +// assuming Field is your common base class +template +constexpr bool is_field_v = std::is_base_of>::value; + +// 3) combine into “is one of our expression types” +template +constexpr bool is_expr_v = is_field_v || is_binary_expr>::value; + #endif // BOUT_EXPRESSION_HXX \ No newline at end of file diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index b1b99caaa9..2c414c8dde 100644 --- a/src/field/generated_fieldops.cxx +++ 
b/src/field/generated_fieldops.cxx @@ -10,45 +10,9 @@ #include #include -template -struct ExprFor { - using type = T; -}; - -template <> -struct ExprFor { - using type = FieldAccessor; -}; - -template -using ExprFor_t = typename ExprFor>::type; - -//template -//class ExpressionExpr : public ExprBase { -//private: -// const Expression& lhs; -// const Expression& rhs; -// Op op; -// -//public: -// ExpressionExpr(const Expression& lhs, const Expression& rhs, Op op) -// : lhs(lhs), rhs(rhs), op(op), -// ExprBase(lhs.getMesh(), lhs.getLocation(), lhs.getDirections()) {} -// -// __device__ BoutReal operator()(int idx) const override { -// return op(lhs(idx), rhs(idx)); -// } -// -// __host__ __device__ int getSize() const override { -// return lhs.getSize(); // Assume same size -// } -// -// __device__ int regionIdx(int idx) const override { -// return lhs.regionIdx(idx); // Use lhs indexing -// } -//}; - -Field3D& Field3D::operator=(BinaryExpr expr) { +#if 0 +template +Field3D& Field3D::operator=(BinaryExpr expr) { constexpr int THREADS = 256; int blocks = (size() + THREADS - 1) / THREADS; @@ -57,8 +21,12 @@ Field3D& Field3D::operator=(BinaryExpr expr) { cudaDeviceSynchronize(); return *this; } -// -Field3D::Field3D(const BinaryExpr& expr) { +template Field3D& + Field3D::operator= (BinaryExpr expr); +#endif + +#if 0 +Field3D::Field3D(const BinaryExpr& expr) { Array data{expr.getSize()}; constexpr int THREADS = 256; @@ -67,6 +35,7 @@ Field3D::Field3D(const BinaryExpr& expr) { cudaDeviceSynchronize(); *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; } +#endif // Provide the C++ wrapper for multiplication of Field3D and Field3D Field3D operator*(const Field3D& lhs, const Field3D& rhs) { @@ -83,8 +52,6 @@ Field3D operator*(const Field3D& lhs, const Field3D& rhs) { } checkData(result); - std::cout << "operator*\n"; - getchar(); return result; } @@ -129,8 +96,6 @@ Field3D operator/(const Field3D& lhs, const Field3D& rhs) { } checkData(result); 
- std::cout << "operator/\n"; - getchar(); return result; } @@ -160,29 +125,32 @@ Field3D& Field3D::operator/=(const Field3D& rhs) { return *this; } +#if 0 // Provide the C++ wrapper for addition of Field3D and Field3D -BinaryExpr operator+(const Field3D& lhs, const Field3D& rhs) { +template +BinaryExpr operator+(const L& lhs, const R& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; checkData(lhs); checkData(rhs); - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast>(lhs), - static_cast>(rhs), + return BinaryExpr{lhs, + rhs, + BinaryExpr::Op::ADD, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), - result.getValidRegionWithDefault("RGN_ALL")}; - - //constexpr int THREADS = 256; - //int blocks = (BE.getSize() + THREADS - 1) / THREADS; - //evaluator<<>>(&result(0, 0, 0), BE); - //return result; + regionID, + (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; } +template BinaryExpr operator+ (const Field3D& lhs, + const Field3D& rhs); +#endif // Provide the C++ operator to update Field3D by addition with Field3D Field3D& Field3D::operator+=(const Field3D& rhs) { @@ -224,8 +192,6 @@ Field3D operator-(const Field3D& lhs, const Field3D& rhs) { result[index] = lhs[index] - rhs[index]; } - std::cout << "operator-\n"; - getchar(); checkData(result); return result; } From e20da9d606a4eaee7581ad3d5ae6ac54f6e7a382 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 27 May 2025 19:24:41 -0700 Subject: [PATCH 04/29] Working - WIP 3 - Uses Views to avoid copying uncopyable stuff --- include/bout/field3d.hxx | 17 +++++++++------- include/bout/fieldops.hxx | 41 ++++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index b03730ddfa..ca62885fdf 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -192,7 +192,8 @@ public: Array data{expr.getSize()}; constexpr int THREADS = 256; int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluatorExpr<<>>(&data[0], expr); + evaluatorExpr<<>>( + &data[0], static_cast::View>(expr)); cudaDeviceSynchronize(); *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; } @@ -427,10 +428,11 @@ public: } struct View { - BoutReal* data; + const BoutReal* data; __device__ inline BoutReal operator()(int idx) const { return data[idx]; } }; operator View() { return View{&data[0]}; } + operator View() const { return View{&data[0]}; } __device__ inline BoutReal operator()(int i) { return View()(i); } __device__ inline BoutReal operator()(int i) const { return View()(i); } @@ -450,7 +452,8 @@ public: Field3D& operator=(BinaryExpr expr) { constexpr int THREADS = 256; int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluatorExpr<<>>(&data[0], expr); + 
evaluatorExpr<<>>( + &data[0], static_cast::View>(expr)); cudaDeviceSynchronize(); return *this; } @@ -551,13 +554,13 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); template && is_expr_v>> -BinaryExpr operator+(const L& lhs, const R& rhs) { +BinaryExpr operator+(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{lhs, - rhs, - BinaryExpr::Op::ADD, + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + BinaryExpr::Op::ADD, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 7cdb339854..359177f3fc 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -26,7 +26,7 @@ struct Div { }; template -__global__ static void evaluatorExpr(BoutReal* out, Expr& expr) { +__global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; for (int i = tid; i < expr.getSize(); i += stride) { @@ -73,20 +73,33 @@ struct BinaryExpr { } } - __host__ __device__ inline int getSize() const { return indices.size; } - __device__ inline int regionIdx(int idx) const { return indices(idx); } - __device__ inline BoutReal operator()(int idx) const { - switch (op) { - case Op::ADD: - return Add{}(lhs(idx), rhs(idx)); - case Op::SUB: - return Sub{}(lhs(idx), rhs(idx)); - case Op::MUL: - return Mul{}(lhs(idx), rhs(idx)); - case Op::DIV: - return Div{}(lhs(idx), rhs(idx)); + __host__ inline int getSize() const { return indices.size; } + + struct View { + L lhs; + R rhs; + int* indices; + int size; + Op op; + + __host__ __device__ inline int getSize() const { return size; } + __device__ inline int regionIdx(int idx) const { return indices[idx]; } + __device__ inline BoutReal operator()(int idx) const { + switch (op) { + case Op::ADD: + return 
Add{}(lhs(idx), rhs(idx)); + case Op::SUB: + return Sub{}(lhs(idx), rhs(idx)); + case Op::MUL: + return Mul{}(lhs(idx), rhs(idx)); + case Op::DIV: + return Div{}(lhs(idx), rhs(idx)); + } } - } + }; + + operator View() { return View{lhs, rhs, indices.data, indices.size, op}; } + operator View() const { return View{lhs, rhs, indices.data, indices.size, op}; } void evaluate(BoutReal* data) const {} From 096f57637216f9133d69a06b6aa8a8b6bdd82f7b Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Tue, 27 May 2025 23:54:55 -0700 Subject: [PATCH 05/29] WIP 3 - operators +=, -=, *=, /= are working --- include/bout/field3d.hxx | 105 ++++++++++++--- include/bout/fieldops.hxx | 7 +- src/field/generated_fieldops.cxx | 214 +------------------------------ 3 files changed, 96 insertions(+), 230 deletions(-) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index ca62885fdf..a299f5bfdc 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -190,11 +190,7 @@ public: template Field3D(const BinaryExpr& expr) { Array data{expr.getSize()}; - constexpr int THREADS = 256; - int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluatorExpr<<>>( - &data[0], static_cast::View>(expr)); - cudaDeviceSynchronize(); + expr.evaluate(&data[0]); *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; } /// Destructor @@ -450,11 +446,7 @@ public: Field3D& operator=(BoutReal val); template Field3D& operator=(BinaryExpr expr) { - constexpr int THREADS = 256; - int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluatorExpr<<>>( - &data[0], static_cast::View>(expr)); - cudaDeviceSynchronize(); + expr.evaluate(&data[0]); return *this; } @@ -462,28 +454,56 @@ public: /// Addition operators ///@{ - Field3D& operator+=(const Field3D& rhs); + //Field3D& operator+=(const Field3D& rhs); + template >> + Field3D& operator+=(const R& rhs) { + printf("Running operator+= with CUDA\n"); + data.ensureUnique(); + (*this) = (*this) + rhs; + 
return *this; + } Field3D& operator+=(const Field2D& rhs); Field3D& operator+=(BoutReal rhs); ///@} /// Subtraction operators ///@{ - Field3D& operator-=(const Field3D& rhs); + //Field3D& operator-=(const Field3D& rhs); + template >> + Field3D& operator-=(const R& rhs) { + printf("Running operator-= with CUDA\n"); + data.ensureUnique(); + (*this) = (*this) - rhs; + return *this; + } Field3D& operator-=(const Field2D& rhs); Field3D& operator-=(BoutReal rhs); ///@} /// Multiplication operators ///@{ - Field3D& operator*=(const Field3D& rhs); + //Field3D& operator*=(const Field3D& rhs); + template >> + Field3D& operator*=(const R& rhs) { + printf("Running operator*= with CUDA\n"); + data.ensureUnique(); + (*this) = (*this) * rhs; + return *this; + } Field3D& operator*=(const Field2D& rhs); Field3D& operator*=(BoutReal rhs); ///@} /// Division operators ///@{ - Field3D& operator/=(const Field3D& rhs); + template >> + Field3D& operator/=(const R& rhs) { + printf("Running operator/= with CUDA\n"); + data.ensureUnique(); + (*this) = (*this) * rhs; + return *this; + } + //Field3D& operator/=(const Field3D& rhs); Field3D& operator/=(const Field2D& rhs); Field3D& operator/=(BoutReal rhs); ///@} @@ -546,6 +566,10 @@ private: // Non-member overloaded operators +template +constexpr bool always_false = false; + + // Binary operators FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); @@ -569,9 +593,56 @@ BinaryExpr operator+(const L& lhs, const R& : lhs.getMesh()->getRegion("RGN_ALL"))}; } -Field3D operator-(const Field3D& lhs, const Field3D& rhs); -Field3D operator*(const Field3D& lhs, const Field3D& rhs); -Field3D operator/(const Field3D& lhs, const Field3D& rhs); +template && is_expr_v>> +BinaryExpr operator-(const L& lhs, const R& rhs) { + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + + std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; + 
return BinaryExpr{static_cast(lhs), + static_cast(rhs), + BinaryExpr::Op::SUB, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + +template && is_expr_v>> +BinaryExpr operator*(const L& lhs, const R& rhs) { + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + + std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + BinaryExpr::Op::MUL, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + +template && is_expr_v>> +BinaryExpr operator/(const L& lhs, const R& rhs) { + auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); + + std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + BinaryExpr::Op::DIV, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} Field3D operator+(const Field3D& lhs, const Field2D& rhs); Field3D operator-(const Field3D& lhs, const Field2D& rhs); diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 359177f3fc..2ef7161f6d 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -101,7 +101,12 @@ struct BinaryExpr { operator View() { return View{lhs, rhs, indices.data, indices.size, op}; } operator View() const { return View{lhs, rhs, indices.data, indices.size, op}; } - void evaluate(BoutReal* data) const {} + void evaluate(BoutReal* data) const { + constexpr int THREADS = 256; + int blocks = (getSize() + THREADS - 1) / THREADS; + evaluatorExpr<<>>(&data[0], static_cast(*this)); + cudaDeviceSynchronize(); + } Mesh* getMesh() const { return mesh; } CELL_LOC getLocation() const { return location; } diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 2c414c8dde..63cbf9b847 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -10,218 +10,6 @@ #include #include -#if 0 -template -Field3D& Field3D::operator=(BinaryExpr expr) { - constexpr int THREADS = 256; - int blocks = (size() + THREADS - 1) / THREADS; - - // one kernel launch that writes each element exactly once - evaluator<<>>(&data[0], expr); - cudaDeviceSynchronize(); - return *this; -} -template Field3D& - Field3D::operator= (BinaryExpr expr); -#endif - -#if 0 -Field3D::Field3D(const BinaryExpr& expr) { - Array data{expr.getSize()}; - - constexpr int THREADS = 256; - int blocks = (expr.getSize() + THREADS - 1) / THREADS; - evaluator<<>>(&data[0], expr); - cudaDeviceSynchronize(); - *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; -} -#endif - -// Provide the C++ wrapper for multiplication of Field3D and Field3D -Field3D operator*(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - 
Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by multiplication with Field3D -Field3D& Field3D::operator*=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field3D and Field3D -Field3D operator/(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by division with Field3D -Field3D& Field3D::operator/=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for addition of Field3D and Field3D -template -BinaryExpr operator+(const L& lhs, const R& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{lhs, - rhs, - BinaryExpr::Op::ADD, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} -template BinaryExpr operator+ (const Field3D& lhs, - const Field3D& rhs); -#endif - -// Provide the C++ operator to update Field3D by addition with Field3D -Field3D& Field3D::operator+=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -// Provide the C++ wrapper for subtraction of Field3D and Field3D -Field3D operator-(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by subtraction with Field3D -Field3D& Field3D::operator-=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - // Provide the C++ wrapper for multiplication of Field3D and Field2D Field3D operator*(const Field3D& lhs, const Field2D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -328,6 +116,7 @@ Field3D& Field3D::operator/=(const Field2D& rhs) { return *this; } +#if 1 // Provide the C++ wrapper for addition of Field3D and Field2D Field3D operator+(const Field3D& lhs, const Field2D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -350,6 +139,7 @@ Field3D operator+(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by addition with Field2D Field3D& Field3D::operator+=(const Field2D& rhs) { From 34cba4d30879dcfbe76e0dfb9bba43af965342d6 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 28 May 2025 00:19:27 -0700 Subject: [PATCH 06/29] WIP 4 - Use functor template parameter for operation --- include/bout/field3d.hxx | 26 +++++++++---------- include/bout/fieldops.hxx | 53 ++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index a299f5bfdc..9696fce77d 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -39,8 +39,6 @@ class Field3D; class Mesh; -//template -//class BinaryExpr; #include "bout/fieldops.hxx" /// Class for 3D X-Y-Z scalar fields @@ -187,8 +185,8 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); - template - Field3D(const BinaryExpr& expr) { + template + Field3D(const BinaryExpr& expr) { Array data{expr.getSize()}; expr.evaluate(&data[0]); *this 
= Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; @@ -444,8 +442,8 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); - template - Field3D& operator=(BinaryExpr expr) { + template + Field3D& operator=(BinaryExpr expr) { expr.evaluate(&data[0]); return *this; } @@ -578,13 +576,13 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); template && is_expr_v>> -BinaryExpr operator+(const L& lhs, const R& rhs) { +BinaryExpr operator+(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; return BinaryExpr{static_cast(lhs), static_cast(rhs), - BinaryExpr::Op::ADD, + bout::op::Add{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), @@ -595,13 +593,13 @@ BinaryExpr operator+(const L& lhs, const R& template && is_expr_v>> -BinaryExpr operator-(const L& lhs, const R& rhs) { +BinaryExpr operator-(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; return BinaryExpr{static_cast(lhs), static_cast(rhs), - BinaryExpr::Op::SUB, + bout::op::Sub{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), @@ -612,13 +610,13 @@ BinaryExpr operator-(const L& lhs, const R& template && is_expr_v>> -BinaryExpr operator*(const L& lhs, const R& rhs) { +BinaryExpr operator*(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; return BinaryExpr{static_cast(lhs), static_cast(rhs), - BinaryExpr::Op::MUL, + bout::op::Mul{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), @@ -629,13 +627,13 @@ BinaryExpr operator*(const L& lhs, const R& template && is_expr_v>> -BinaryExpr operator/(const L& 
lhs, const R& rhs) { +BinaryExpr operator/(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; return BinaryExpr{static_cast(lhs), static_cast(rhs), - BinaryExpr::Op::DIV, + bout::op::Div{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 2ef7161f6d..b2052b12a1 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -12,17 +12,21 @@ class Field3D; #include -struct Add { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } -}; -struct Sub { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } -}; -struct Mul { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } +namespace bout { +namespace op { + struct Add { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } + }; + struct Sub { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } + }; + struct Mul { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } + }; + struct Div { + __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } + }; }; -struct Div { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } }; template @@ -34,7 +38,7 @@ __global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { } } -template +template struct BinaryExpr { enum class Op { ADD, SUB, MUL, DIV }; struct RegionIndices { @@ -54,7 +58,7 @@ struct BinaryExpr { L lhs; R rhs; RegionIndices indices; - Op op; + Func f; Mesh* mesh; CELL_LOC location = CELL_CENTRE; @@ -62,10 +66,10 @@ struct BinaryExpr { std::optional regionID; template - BinaryExpr(L lhs, R rhs, Op op, Mesh* mesh, CELL_LOC location, + BinaryExpr(L lhs, R rhs, Func f, Mesh* 
mesh, CELL_LOC location, DirectionTypes directions, std::optional regionID, const Region& region) - : lhs(lhs), rhs(rhs), op(op), mesh(mesh), location(location), + : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), regionID(regionID), indices(region.getIndices().size()) { // Copy the region indices into the managed array for (int i = 0; i < indices.size; ++i) { @@ -80,26 +84,17 @@ struct BinaryExpr { R rhs; int* indices; int size; - Op op; + Func f; __host__ __device__ inline int getSize() const { return size; } __device__ inline int regionIdx(int idx) const { return indices[idx]; } __device__ inline BoutReal operator()(int idx) const { - switch (op) { - case Op::ADD: - return Add{}(lhs(idx), rhs(idx)); - case Op::SUB: - return Sub{}(lhs(idx), rhs(idx)); - case Op::MUL: - return Mul{}(lhs(idx), rhs(idx)); - case Op::DIV: - return Div{}(lhs(idx), rhs(idx)); - } + f(lhs(idx), rhs(idx)); // single‐pass fusion } }; - operator View() { return View{lhs, rhs, indices.data, indices.size, op}; } - operator View() const { return View{lhs, rhs, indices.data, indices.size, op}; } + operator View() { return View{lhs, rhs, indices.data, indices.size, f}; } + operator View() const { return View{lhs, rhs, indices.data, indices.size, f}; } void evaluate(BoutReal* data) const { constexpr int THREADS = 256; @@ -127,8 +122,8 @@ struct BinaryExpr { // 1) detect our BinaryExpr template template struct is_binary_expr : std::false_type {}; -template -struct is_binary_expr> : std::true_type {}; +template +struct is_binary_expr> : std::true_type {}; // 2) detect “any subclass of Field” // assuming Field is your common base class From 7d75b9db770db6ab7a30d3b7c27151e817e82086 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 28 May 2025 00:29:57 -0700 Subject: [PATCH 07/29] Used managed array for indices --- include/bout/fieldops.hxx | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git 
a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index b2052b12a1..2ea6ac44a6 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -41,23 +41,9 @@ __global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { template struct BinaryExpr { enum class Op { ADD, SUB, MUL, DIV }; - struct RegionIndices { - int* data; - int size; - - RegionIndices(int n) : size(n) { - cudaMallocManaged(&data, n * sizeof(int)); - for (int i = 0; i < n; ++i) - data[i] = 0; - } - ~RegionIndices() { cudaFree(data); } - - __device__ inline int operator()(int idx) const { return data[idx]; } - }; - L lhs; R rhs; - RegionIndices indices; + Array indices; Func f; Mesh* mesh; @@ -72,17 +58,17 @@ struct BinaryExpr { : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), regionID(regionID), indices(region.getIndices().size()) { // Copy the region indices into the managed array - for (int i = 0; i < indices.size; ++i) { - indices.data[i] = region.getIndices()[i].ind; + for (int i = 0; i < indices.size(); ++i) { + indices[i] = region.getIndices()[i].ind; } } - __host__ inline int getSize() const { return indices.size; } + __host__ inline int getSize() const { return indices.size(); } struct View { L lhs; R rhs; - int* indices; + const int* indices; int size; Func f; @@ -93,8 +79,8 @@ struct BinaryExpr { } }; - operator View() { return View{lhs, rhs, indices.data, indices.size, f}; } - operator View() const { return View{lhs, rhs, indices.data, indices.size, f}; } + operator View() { return View{lhs, rhs, &indices[0], indices.size(), f}; } + operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } void evaluate(BoutReal* data) const { constexpr int THREADS = 256; From b8e7e973e0a4a245c1aa1be2933a0c830a2818e6 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 28 May 2025 02:03:25 -0700 Subject: [PATCH 08/29] Better SFINAE for specializations --- include/bout/field3d.hxx | 67 
++++++++++++++++++++++++++++++++------- include/bout/fieldops.hxx | 21 +++++------- 2 files changed, 63 insertions(+), 25 deletions(-) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 9696fce77d..cd981ff60d 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -40,6 +40,13 @@ class Field3D; class Mesh; #include "bout/fieldops.hxx" +// Base template: nothing is an expression by default +template +struct is_expr_field3d : std::false_type {}; + +// Helper variable template +template +inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; /// Class for 3D X-Y-Z scalar fields /*! @@ -453,7 +460,7 @@ public: /// Addition operators ///@{ //Field3D& operator+=(const Field3D& rhs); - template >> + template >> Field3D& operator+=(const R& rhs) { printf("Running operator+= with CUDA\n"); data.ensureUnique(); @@ -467,7 +474,7 @@ public: /// Subtraction operators ///@{ //Field3D& operator-=(const Field3D& rhs); - template >> + template >> Field3D& operator-=(const R& rhs) { printf("Running operator-= with CUDA\n"); data.ensureUnique(); @@ -481,7 +488,7 @@ public: /// Multiplication operators ///@{ //Field3D& operator*=(const Field3D& rhs); - template >> + template >> Field3D& operator*=(const R& rhs) { printf("Running operator*= with CUDA\n"); data.ensureUnique(); @@ -494,7 +501,7 @@ public: /// Division operators ///@{ - template >> + template >> Field3D& operator/=(const R& rhs) { printf("Running operator/= with CUDA\n"); data.ensureUnique(); @@ -575,8 +582,9 @@ FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); template && is_expr_v>> -BinaryExpr operator+(const L& lhs, const R& rhs) { + typename = std::enable_if_t && is_expr_field3d_v>> +BinaryExpr operator+(const L& lhs, + const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; @@ 
-592,8 +600,9 @@ BinaryExpr operator+(const L& } template && is_expr_v>> -BinaryExpr operator-(const L& lhs, const R& rhs) { + typename = std::enable_if_t && is_expr_field3d_v>> +BinaryExpr operator-(const L& lhs, + const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; @@ -609,8 +618,9 @@ BinaryExpr operator-(const L& } template && is_expr_v>> -BinaryExpr operator*(const L& lhs, const R& rhs) { + typename = std::enable_if_t && is_expr_field3d_v>> +BinaryExpr operator*(const L& lhs, + const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; @@ -626,8 +636,9 @@ BinaryExpr operator*(const L& } template && is_expr_v>> -BinaryExpr operator/(const L& lhs, const R& rhs) { + typename = std::enable_if_t && is_expr_field3d_v>> +BinaryExpr operator/(const L& lhs, + const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; @@ -643,6 +654,22 @@ BinaryExpr operator/(const L& } Field3D operator+(const Field3D& lhs, const Field2D& rhs); +// template && is_expr_field2d_v>> +// BinaryExpr operator+(const L& lhs, const R& rhs) { +// auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); +// +// std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; +// return BinaryExpr{static_cast(lhs), +// static_cast(rhs), +// bout::op::Add{}, +// lhs.getMesh(), +// lhs.getLocation(), +// lhs.getDirections(), +// regionID, +// (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) +// : lhs.getMesh()->getRegion("RGN_ALL"))}; +// } Field3D operator-(const Field3D& lhs, const Field2D& rhs); Field3D operator*(const Field3D& lhs, const Field2D& rhs); Field3D operator/(const Field3D& lhs, const Field2D& rhs); @@ -769,4 +796,20 @@ bool operator==(const Field3D& a, const Field3D& b); /// Output a string describing a Field3D to a stream std::ostream& operator<<(std::ostream& out, const Field3D& value); +// A raw Field3D is an expression leaf +template <> +struct is_expr_field3d : std::true_type {}; +template <> +struct is_expr_field3d : std::true_type {}; + +// Any nested BinaryExpr is an expression iff L is +template +struct is_expr_field3d> + : std::true_type {}; + +//template +//struct is_expr_field3d< typename BinaryExpr::View > +// : std::integral_constant>::value> {}; +// //: is_expr_field3d> {}; + #endif /* BOUT_FIELD3D_H */ diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 2ea6ac44a6..7c7f511cc3 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -40,7 +40,6 @@ __global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { template struct BinaryExpr { - enum class Op { ADD, SUB, MUL, DIV }; L lhs; R rhs; Array indices; @@ -95,16 +94,7 @@ struct BinaryExpr { std::optional getRegionID() const { return regionID; }; }; -//template -//struct Expr { -// using type = T; -//}; -// -//template <> -//struct Expr { -// using type = Field3D::View; -//}; - +#if 0 // 1) detect our BinaryExpr template template struct is_binary_expr : std::false_type {}; @@ -114,10 +104,15 @@ struct is_binary_expr> : std::true_type {}; // 2) detect “any subclass of Field” // assuming Field is your common base class template -constexpr bool is_field_v = std::is_base_of>::value; +constexpr bool is_field3d_v = std::is_base_of>::value; // 3) combine into “is one of our expression types” template -constexpr bool is_expr_v = is_field_v || is_binary_expr>::value; +constexpr bool 
is_expr_field3d_v = + is_field3d_v || is_binary_expr>::value; +#endif + +#if 1 +#endif #endif // BOUT_EXPRESSION_HXX \ No newline at end of file From d830f8d781c48921a9f6ce5ec5a519d162c9fa3a Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 28 May 2025 23:15:30 -0700 Subject: [PATCH 09/29] Fix major bug in the binary expr operator() and add operators - Working version --- include/bout/field2d.hxx | 10 + include/bout/field3d.hxx | 163 ++++++++++----- include/bout/fieldops.hxx | 84 ++++---- include/bout/rajalib.hxx | 2 +- include/bout/vector3d.hxx | 1 - src/field/generated_fieldops.cxx | 193 +++++++++++++++++- src/field/vecops.cxx | 1 - .../laplace/impls/naulin/naulin_laplace.cxx | 1 - src/sys/derivs.cxx | 1 - 9 files changed, 348 insertions(+), 108 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 92658f1bbf..5f0901ac67 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -276,6 +276,16 @@ public: int size() const override { return nx * ny; }; + struct View { + const BoutReal* data; + __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + }; + operator View() { return View{&data[0]}; } + operator View() const { return View{&data[0]}; } + + __device__ inline BoutReal operator()(int i) { return View()(i); } + __device__ inline BoutReal operator()(int i) const { return View()(i); } + private: /// Internal data array. 
Handles allocation/freeing of memory Array data; diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index cd981ff60d..c6bc02faf9 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -44,10 +44,16 @@ class Mesh; template struct is_expr_field3d : std::false_type {}; +template +struct is_expr_field2d : std::false_type {}; + // Helper variable template template inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; +template +inline constexpr bool is_expr_field2d_v = is_expr_field2d>::valuen; + /// Class for 3D X-Y-Z scalar fields /*! This class represents a scalar field defined over the mesh. @@ -194,9 +200,12 @@ public: ZDirectionType::Standard}); template Field3D(const BinaryExpr& expr) { - Array data{expr.getSize()}; + std::cout << "RUNNING constructor from BinaryExpr\n"; + Array data{expr.size()}; expr.evaluate(&data[0]); - *this = Field3D{data, expr.getMesh(), expr.getLocation(), expr.getDirections()}; + *this = std::move(Field3D{std::move(data), expr.getMesh(), expr.getLocation(), + expr.getDirections()}); + setRegion(expr.getRegionID()); } /// Destructor ~Field3D() override; @@ -430,13 +439,12 @@ public: struct View { const BoutReal* data; - __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + __host__ __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + //__device__ inline const BoutReal* operator()() const { return data; } }; operator View() { return View{&data[0]}; } operator View() const { return View{&data[0]}; } - __device__ inline BoutReal operator()(int i) { return View()(i); } - __device__ inline BoutReal operator()(int i) const { return View()(i); } ///////////////////////////////////////////////////////// // Operators @@ -450,7 +458,9 @@ public: void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); template - Field3D& operator=(BinaryExpr expr) { + Field3D& operator=(BinaryExpr& expr) { + std::cout << "RUNNING operator= with CUDA\n"; + 
regionID = expr.getRegionID(); expr.evaluate(&data[0]); return *this; } @@ -462,9 +472,19 @@ public: //Field3D& operator+=(const Field3D& rhs); template >> Field3D& operator+=(const R& rhs) { - printf("Running operator+= with CUDA\n"); - data.ensureUnique(); - (*this) = (*this) + rhs; + printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. + clearParallelSlices(); + + auto BE = (*this) + rhs; + regionID = BE.getRegionID(); + BE.evaluate(&data[0]); + } else { + (*this) = (*this) + rhs; + } + return *this; } Field3D& operator+=(const Field2D& rhs); @@ -476,9 +496,18 @@ public: //Field3D& operator-=(const Field3D& rhs); template >> Field3D& operator-=(const R& rhs) { - printf("Running operator-= with CUDA\n"); - data.ensureUnique(); - (*this) = (*this) - rhs; + if (data.unique()) { + printf("RUNNING operator-= with CUDA with BE\n"); + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. + clearParallelSlices(); + auto BE = (*this) - rhs; + BE.evaluate(&data[0]); + } else { + printf("RUNNING operator-= with CUDA with operation\n"); + (*this) = (*this) - rhs; + } + return *this; } Field3D& operator-=(const Field2D& rhs); @@ -490,9 +519,19 @@ public: //Field3D& operator*=(const Field3D& rhs); template >> Field3D& operator*=(const R& rhs) { - printf("Running operator*= with CUDA\n"); - data.ensureUnique(); - (*this) = (*this) * rhs; + printf("RUNNING operator*= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + auto BE = (*this) * rhs; + regionID = BE.getRegionID(); + BE.evaluate(&data[0]); + } else { + (*this) = (*this) * rhs; + } + return *this; } Field3D& operator*=(const Field2D& rhs); @@ -501,14 +540,24 @@ public: /// Division operators ///@{ + //Field3D& operator/=(const Field3D& rhs); template >> Field3D& operator/=(const R& rhs) { - printf("Running operator/= with CUDA\n"); - data.ensureUnique(); - (*this) = (*this) * rhs; + printf("RUNNING operator/= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. + clearParallelSlices(); + + auto BE = (*this) / rhs; + regionID = BE.getRegionID(); + BE.evaluate(&data[0]); + } else { + (*this) = (*this) / rhs; + } + return *this; } - //Field3D& operator/=(const Field3D& rhs); Field3D& operator/=(const Field2D& rhs); Field3D& operator/=(BoutReal rhs); ///@} @@ -571,10 +620,9 @@ private: // Non-member overloaded operators -template +template constexpr bool always_false = false; - // Binary operators FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); @@ -583,13 +631,12 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); template && is_expr_field3d_v>> -BinaryExpr operator+(const L& lhs, - const R& rhs) { +BinaryExpr operator+(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), + return BinaryExpr{(lhs), + (rhs), bout::op::Add{}, lhs.getMesh(), lhs.getLocation(), @@ -601,13 +648,12 @@ BinaryExpr operator+(const L& template && is_expr_field3d_v>> -BinaryExpr operator-(const L& lhs, - const R& rhs) { +BinaryExpr operator-(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), 
rhs.getRegionID()); std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), + return BinaryExpr{(lhs), + (rhs), bout::op::Sub{}, lhs.getMesh(), lhs.getLocation(), @@ -619,13 +665,12 @@ BinaryExpr operator-(const L& template && is_expr_field3d_v>> -BinaryExpr operator*(const L& lhs, - const R& rhs) { +BinaryExpr operator*(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), + return BinaryExpr{(lhs), + (rhs), bout::op::Mul{}, lhs.getMesh(), lhs.getLocation(), @@ -637,13 +682,12 @@ BinaryExpr operator*(const L& template && is_expr_field3d_v>> -BinaryExpr operator/(const L& lhs, - const R& rhs) { +BinaryExpr operator/(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), + return BinaryExpr{(lhs), + (rhs), bout::op::Div{}, lhs.getMesh(), lhs.getLocation(), @@ -654,22 +698,26 @@ BinaryExpr operator/(const L& } Field3D operator+(const Field3D& lhs, const Field2D& rhs); -// template && is_expr_field2d_v>> -// BinaryExpr operator+(const L& lhs, const R& rhs) { -// auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); +//template +//auto operator+(const L& lhs, const R& rhs) +// -> std::enable_if_t && is_expr_field2d_v, +// BinaryExpr> { +// static_assert(always_false || always_false, "Hello"); +// auto regionID = lhs.getRegionID(); // -// std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; -// return BinaryExpr{static_cast(lhs), -// static_cast(rhs), -// bout::op::Add{}, -// lhs.getMesh(), -// lhs.getLocation(), -// lhs.getDirections(), -// regionID, -// (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) -// : lhs.getMesh()->getRegion("RGN_ALL"))}; -// } +// std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; +// int mesh_nz = lhs.getMesh()->LocalNz; +// auto LambdaOp = [mesh_nz]() { +// }; +// return BinaryExpr{(lhs), +// (rhs), +// bout::op::Add{}, +// lhs.getMesh(), +// lhs.getLocation(), +// lhs.getDirections(), +// regionID, +// rhs.getRegion("RGN_ALL")}; +//} Field3D operator-(const Field3D& lhs, const Field2D& rhs); Field3D operator*(const Field3D& lhs, const Field2D& rhs); Field3D operator/(const Field3D& lhs, const Field2D& rhs); @@ -713,7 +761,7 @@ void checkData(const Field3D& f, const std::string& region = "RGN_NOBNDRY"); /// Ignored with disabled CHECK; Throw an exception if \p f is not /// allocated or if any elements are non-finite (for CHECK > 2) inline void checkData(const Field3D& UNUSED(f), - const std::string& UNUSED(region) = "RGN_NOBNDRY"){}; + const std::string& UNUSED(region) = "RGN_NOBNDRY") {}; #endif /// Fourier filtering, removes all except one mode @@ -799,13 +847,18 @@ std::ostream& operator<<(std::ostream& out, const Field3D& value); // A raw Field3D is an expression leaf template <> struct is_expr_field3d : std::true_type {}; + template <> -struct is_expr_field3d : std::true_type {}; +struct is_expr_field2d : std::true_type {}; // Any nested BinaryExpr is an expression iff L is +//template +//struct is_expr_field3d> +// : std::true_type {}; + template struct is_expr_field3d> - : std::true_type {}; + : std::integral_constant>::value> {}; //template //struct is_expr_field3d< typename BinaryExpr::View > diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 7c7f511cc3..e9c87f242f 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -15,16 +15,26 @@ class Field3D; namespace bout { namespace op { struct Add { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } + template + __host__ __device__ inline 
BoutReal operator()(int idx, const LView &L, const RView &R) const { + return L(idx) + R(idx); + } + __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } }; struct Sub { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } - }; - struct Mul { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } - }; - struct Div { - __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } + template + __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) - R(idx); } + __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } + }; + struct Mul { + template + __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) * R(idx); } + __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } + }; + struct Div { + template + __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) / R(idx); } + __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } }; }; }; @@ -33,15 +43,17 @@ template __global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; - for (int i = tid; i < expr.getSize(); i += stride) { + for (int i = tid; i < expr.size(); i += stride) { out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion } } template struct BinaryExpr { - L lhs; - R rhs; + const L &LHS; + const R &RHS; + typename L::View lhs; + typename R::View rhs; Array indices; Func f; @@ -51,30 +63,36 @@ struct BinaryExpr { std::optional regionID; template - BinaryExpr(L lhs, R rhs, Func f, Mesh* mesh, CELL_LOC location, + BinaryExpr(const L &lhs, const R &rhs, Func f, Mesh* mesh, CELL_LOC location, 
DirectionTypes directions, std::optional regionID, const Region& region) - : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), - directions(directions), regionID(regionID), indices(region.getIndices().size()) { + : LHS(lhs), RHS(rhs), lhs(static_cast(lhs)), rhs(static_cast(rhs)), + f(f), mesh(mesh), location(location), directions(directions), regionID(regionID), + indices(region.getIndices().size()) { // Copy the region indices into the managed array for (int i = 0; i < indices.size(); ++i) { indices[i] = region.getIndices()[i].ind; } } - __host__ inline int getSize() const { return indices.size(); } + inline int size() const { return indices.size(); } + inline BoutReal operator()(int idx) const { + return f(idx, lhs, rhs); // single‐pass fusion + } + inline int regionIdx(int idx) const { return indices[idx]; } struct View { - L lhs; - R rhs; + typename L::View lhs; + typename R::View rhs; const int* indices; - int size; + int num_indices; Func f; - __host__ __device__ inline int getSize() const { return size; } + __device__ inline int size() const { return num_indices; } __device__ inline int regionIdx(int idx) const { return indices[idx]; } __device__ inline BoutReal operator()(int idx) const { - f(lhs(idx), rhs(idx)); // single‐pass fusion + return f(idx, lhs, rhs); // single‐pass fusion + //return f(lhs(idx), rhs(idx)); // single‐pass fusion } }; @@ -83,9 +101,12 @@ struct BinaryExpr { void evaluate(BoutReal* data) const { constexpr int THREADS = 256; - int blocks = (getSize() + THREADS - 1) / THREADS; + int blocks = (size() + THREADS - 1) / THREADS; evaluatorExpr<<>>(&data[0], static_cast(*this)); cudaDeviceSynchronize(); + //for(int i=0; i getRegionID() const { return regionID; }; }; -#if 0 -// 1) detect our BinaryExpr template -template -struct is_binary_expr : std::false_type {}; -template -struct is_binary_expr> : std::true_type {}; - -// 2) detect “any subclass of Field” -// assuming Field is your common base class -template -constexpr bool 
is_field3d_v = std::is_base_of>::value; - -// 3) combine into “is one of our expression types” -template -constexpr bool is_expr_field3d_v = - is_field3d_v || is_binary_expr>::value; -#endif - -#if 1 - -#endif #endif // BOUT_EXPRESSION_HXX \ No newline at end of file diff --git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index b3da46da50..d61a58e0d8 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -145,7 +145,7 @@ template __global__ void evaluator(BoutReal *out, Expr &expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; - for (int i = tid; i < expr.getSize(); i += stride) { + for (int i = tid; i < expr.size(); i += stride) { out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion } } diff --git a/include/bout/vector3d.hxx b/include/bout/vector3d.hxx index ad68dc17ee..0c71dcffa5 100644 --- a/include/bout/vector3d.hxx +++ b/include/bout/vector3d.hxx @@ -36,7 +36,6 @@ class Vector3D; class Field2D; class Vector2D; #include "bout/field3d.hxx" -#include "bout/fieldops.hxx" /*! 
* Represents a 3D vector, with x,y,z components diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 63cbf9b847..c35ae2e866 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -1,15 +1,198 @@ // This file is autogenerated - see gen_fieldops.py -#include "bout/rajalib.hxx" -#include "bout/fieldops.hxx" - #include #include -#include #include #include #include #include +// Provide the C++ wrapper for multiplication of Field3D and Field3D +Field3D operator*(const Field3D& lhs, const Field3D& rhs) { + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); + + Field3D result{emptyFrom(lhs)}; + checkData(lhs); + checkData(rhs); + + result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + + BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { + result[index] = lhs[index] * rhs[index]; + } + + checkData(result); + return result; +} + +// Provide the C++ operator to update Field3D by multiplication with Field3D +#if 0 +Field3D& Field3D::operator*=(const Field3D& rhs) { + // only if data is unique we update the field + // otherwise just call the non-inplace version + if (data.unique()) { + ASSERT1_FIELDS_COMPATIBLE(*this, rhs); + + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + checkData(*this); + checkData(rhs); + + regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); + + BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } + + checkData(*this); + + } else { + (*this) = (*this) * rhs; + } + return *this; +} +#endif + +// Provide the C++ wrapper for division of Field3D and Field3D +Field3D operator/(const Field3D& lhs, const Field3D& rhs) { + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); + + Field3D result{emptyFrom(lhs)}; + checkData(lhs); + checkData(rhs); + + result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + + BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { + result[index] = lhs[index] / rhs[index]; + } + + checkData(result); + return result; +} + +// Provide the C++ operator to update Field3D by division with Field3D +#if 0 +Field3D& Field3D::operator/=(const Field3D& rhs) { + // only if data is unique we update the field + // otherwise just call the non-inplace version + if (data.unique()) { + ASSERT1_FIELDS_COMPATIBLE(*this, rhs); + + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + checkData(*this); + checkData(rhs); + + regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); + + BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } + + checkData(*this); + + } else { + (*this) = (*this) / rhs; + } + return *this; +} +#endif + +// Provide the C++ wrapper for addition of Field3D and Field3D +Field3D operator+(const Field3D& lhs, const Field3D& rhs) { + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); + + Field3D result{emptyFrom(lhs)}; + checkData(lhs); + checkData(rhs); + + result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + + BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { + result[index] = lhs[index] + rhs[index]; + } + + checkData(result); + return result; +} + +#if 0 +// Provide the C++ operator to update Field3D by addition with Field3D +Field3D& Field3D::operator+=(const Field3D& rhs) { + // only if data is unique we update the field + // otherwise just call the non-inplace version + if (data.unique()) { + ASSERT1_FIELDS_COMPATIBLE(*this, rhs); + + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + checkData(*this); + checkData(rhs); + + regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); + + BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } + + checkData(*this); + + } else { + (*this) = (*this) + rhs; + } + return *this; +} +#endif + +// Provide the C++ wrapper for subtraction of Field3D and Field3D +Field3D operator-(const Field3D& lhs, const Field3D& rhs) { + ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); + + Field3D result{emptyFrom(lhs)}; + checkData(lhs); + checkData(rhs); + + result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); + + BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { + result[index] = lhs[index] - rhs[index]; + } + + checkData(result); + return result; +} + +// Provide the C++ operator to update Field3D by subtraction with Field3D +#if 0 +Field3D& Field3D::operator-=(const Field3D& rhs) { + // only if data is unique we update the field + // otherwise just call the non-inplace version + if (data.unique()) { + ASSERT1_FIELDS_COMPATIBLE(*this, rhs); + + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + checkData(*this); + checkData(rhs); + + regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); + + BOUT_FOR(index, this->getRegion("RGN_ALL")) { + (*this)[index] -= rhs[index]; + printf("[golden] val[%d] %lf\n", index, (*this)[index]); + } + + checkData(*this); + + } else { + (*this) = (*this) - rhs; + } + return *this; +} +#endif + // Provide the C++ wrapper for multiplication of Field3D and Field2D Field3D operator*(const Field3D& lhs, const Field2D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -116,7 +299,6 @@ Field3D& Field3D::operator/=(const Field2D& rhs) { return *this; } -#if 1 // Provide the C++ wrapper for addition of Field3D and Field2D Field3D operator+(const Field3D& lhs, const Field2D& rhs) { ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -139,7 +321,6 @@ Field3D operator+(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } -#endif // Provide the C++ operator to update Field3D by addition with Field2D Field3D& Field3D::operator+=(const Field2D& rhs) { diff --git a/src/field/vecops.cxx b/src/field/vecops.cxx index 9b1105e7aa..5f34e2af02 100644 --- a/src/field/vecops.cxx +++ b/src/field/vecops.cxx @@ -28,7 +28,6 @@ #include #include -#include #include #include #include diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx index 74ec68dae9..e6f68d850d 100644 --- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx +++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx @@ -142,7 +142,6 @@ #include #include #include -#include #include #include #include diff --git a/src/sys/derivs.cxx b/src/sys/derivs.cxx index 55e2c77a29..ee9bcbcc2c 100644 --- a/src/sys/derivs.cxx +++ b/src/sys/derivs.cxx @@ -49,7 +49,6 @@ #include -#include #include #include From c5f9fd668138b61cbd3ce6542177d538ac7becec Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Thu, 29 May 2025 22:40:53 -0700 Subject: [PATCH 10/29] WIP - More operators --- 
include/bout/array.hxx | 2 +- include/bout/field2d.hxx | 33 +++- include/bout/field3d.hxx | 254 ++++++++++++++++++---------- include/bout/fieldops.hxx | 157 +++++++++++++---- src/field/generated_fieldops.cxx | 281 +++++++++++-------------------- 5 files changed, 418 insertions(+), 309 deletions(-) diff --git a/include/bout/array.hxx b/include/bout/array.hxx index b83c29c51d..2c42f15aad 100644 --- a/include/bout/array.hxx +++ b/include/bout/array.hxx @@ -66,7 +66,7 @@ struct ArrayData { #if BOUT_HAS_UMPIRE auto& rm = umpire::ResourceManager::getInstance(); #if BOUT_HAS_CUDA - auto allocator = rm.getAllocator(umpire::resource::Unified); + auto allocator = rm.getAllocator(umpire::resource::Pinned); #else auto allocator = rm.getAllocator("HOST"); #endif diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 5f0901ac67..db2ce194f8 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -277,11 +277,22 @@ public: int size() const override { return nx * ny; }; struct View { - const BoutReal* data; - __device__ inline BoutReal operator()(int idx) const { return data[idx]; } + BoutReal* data; + int mul = 1; + int div = 1; + __device__ inline BoutReal operator()(int idx) const { return data[(idx*mul/div)]; } + __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul)/div]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } }; operator View() { return View{&data[0]}; } - operator View() const { return View{&data[0]}; } + operator View() const { return View{const_cast(&data[0])}; } __device__ inline BoutReal operator()(int i) { return View()(i); } __device__ inline BoutReal operator()(int i) const { return View()(i); } @@ -302,6 +313,22 @@ private: Field2D operator+(const Field2D& lhs, const Field2D& rhs); Field2D operator-(const Field2D& lhs, const Field2D& rhs); Field2D operator*(const Field2D& lhs, const Field2D& rhs); +#if 0 +template && is_expr_field2d_v>> +BinaryExpr 
operator*(const L& lhs, const R& rhs) { + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + lhs.getRegionID(), + (regionID.has_value() + ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} +#endif Field2D operator/(const Field2D& lhs, const Field2D& rhs); Field3D operator+(const Field2D& lhs, const Field3D& rhs); diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index c6bc02faf9..33e42cccbb 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -52,7 +52,7 @@ template inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; template -inline constexpr bool is_expr_field2d_v = is_expr_field2d>::valuen; +inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; /// Class for 3D X-Y-Z scalar fields /*! @@ -200,7 +200,7 @@ public: ZDirectionType::Standard}); template Field3D(const BinaryExpr& expr) { - std::cout << "RUNNING constructor from BinaryExpr\n"; + //std::cout << "RUNNING constructor from BinaryExpr\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(Field3D{std::move(data), expr.getMesh(), expr.getLocation(), @@ -438,13 +438,30 @@ public: } struct View { - const BoutReal* data; - __host__ __device__ inline BoutReal operator()(int idx) const { return data[idx]; } - //__device__ inline const BoutReal* operator()() const { return data; } + BoutReal* data; + int mul = 1; + int div = 1; + int offset = 0; + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul) / div + offset]; + } + __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul) / div + offset]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } + View& setOffset(int o) { + offset = o; + return *this; + } }; operator View() { return View{&data[0]}; } - operator View() const { return View{&data[0]}; } - 
+ operator View() const { return View{const_cast(&data[0])}; } + //operator View() const { return View{&data[0]}; } ///////////////////////////////////////////////////////// // Operators @@ -461,7 +478,8 @@ public: Field3D& operator=(BinaryExpr& expr) { std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); - expr.evaluate(&data[0]); + //expr.evaluate(&data[0]); + expr.evaluateWithResult(static_cast(*this)); return *this; } @@ -472,15 +490,17 @@ public: //Field3D& operator+=(const Field3D& rhs); template >> Field3D& operator+=(const R& rhs) { - printf("RUNNING operator+= with CUDA\n"); + //printf("RUNNING operator+= with CUDA\n"); if (data.unique()) { + printf("RUNNING operator+= with CUDA with evaluateWithResult\n"); // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. clearParallelSlices(); auto BE = (*this) + rhs; regionID = BE.getRegionID(); - BE.evaluate(&data[0]); + //BE.evaluate(&data[0]); + BE.evaluateWithResult(static_cast(*this)); } else { (*this) = (*this) + rhs; } @@ -497,14 +517,14 @@ public: template >> Field3D& operator-=(const R& rhs) { if (data.unique()) { - printf("RUNNING operator-= with CUDA with BE\n"); + //printf("RUNNING operator-= with CUDA with BE\n"); // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. clearParallelSlices(); auto BE = (*this) - rhs; BE.evaluate(&data[0]); } else { - printf("RUNNING operator-= with CUDA with operation\n"); + //printf("RUNNING operator-= with CUDA with operation\n"); (*this) = (*this) - rhs; } @@ -519,7 +539,7 @@ public: //Field3D& operator*=(const Field3D& rhs); template >> Field3D& operator*=(const R& rhs) { - printf("RUNNING operator*= with CUDA\n"); + //printf("RUNNING operator*= with CUDA\n"); if (data.unique()) { // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. 
@@ -541,9 +561,9 @@ public: /// Division operators ///@{ //Field3D& operator/=(const Field3D& rhs); - template >> - Field3D& operator/=(const R& rhs) { - printf("RUNNING operator/= with CUDA\n"); + template + std::enable_if_t,Field3D&> operator/=(const R& rhs) { + //printf("RUNNING operator/= with CUDA\n"); if (data.unique()) { // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. @@ -558,7 +578,23 @@ public: return *this; } - Field3D& operator/=(const Field2D& rhs); + //Field3D& operator/=(const Field2D& rhs); + template +std::enable_if_t, Field3D&> operator/=(const R& rhs) { + //printf("RUNNING operator/= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. + clearParallelSlices(); + + auto BE = (*this) / rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) / rhs; + } + + return *this; + } Field3D& operator/=(BoutReal rhs); ///@} @@ -634,16 +670,17 @@ template operator+(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{(lhs), - (rhs), - bout::op::Add{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + //std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Add{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() + ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; } template operator-(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{(lhs), - (rhs), - bout::op::Sub{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + //std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Sub{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() + ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; } -template && is_expr_field3d_v>> -BinaryExpr operator*(const L& lhs, const R& rhs) { +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr> +operator*(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{(lhs), - (rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + //std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() + ? 
lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; } template operator/(const L& lhs, const R& rhs) { auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{(lhs), - (rhs), - bout::op::Div{}, + //std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Div{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + (regionID.has_value() + ? lhs.getMesh()->getRegion(regionID.value()) + : lhs.getMesh()->getRegion("RGN_ALL"))}; +} + +Field3D operator+(const Field3D& lhs, const Field2D& rhs); +#if 0 +template && is_expr_field2d_v, + BinaryExpr>> +BinaryExpr operator+(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = lhs.getRegionID(); + + std::cout << "RUNNING Field3D + Field2D using BinaryExpr with CUDA" << "\n"; + int mesh_nz = lhs.getMesh()->LocalNz; + + return BinaryExpr{static_cast(lhs), + static_cast(rhs).setScale(1, mesh_nz), + bout::op::Add{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), regionID, - (regionID.has_value() ? 
lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + rhs.getRegion("RGN_ALL")}; } - -Field3D operator+(const Field3D& lhs, const Field2D& rhs); -//template -//auto operator+(const L& lhs, const R& rhs) -// -> std::enable_if_t && is_expr_field2d_v, -// BinaryExpr> { -// static_assert(always_false || always_false, "Hello"); -// auto regionID = lhs.getRegionID(); -// -// std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; -// int mesh_nz = lhs.getMesh()->LocalNz; -// auto LambdaOp = [mesh_nz]() { -// }; -// return BinaryExpr{(lhs), -// (rhs), -// bout::op::Add{}, -// lhs.getMesh(), -// lhs.getLocation(), -// lhs.getDirections(), -// regionID, -// rhs.getRegion("RGN_ALL")}; -//} +#endif Field3D operator-(const Field3D& lhs, const Field2D& rhs); -Field3D operator*(const Field3D& lhs, const Field2D& rhs); -Field3D operator/(const Field3D& lhs, const Field2D& rhs); +//Field3D operator*(const Field3D& lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator*(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = lhs.getRegionID(); + + //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; + int mesh_nz = lhs.getMesh()->LocalNz; + + return BinaryExpr{ + static_cast(lhs), + static_cast(rhs).setScale(1, mesh_nz), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + lhs.getMesh()->getRegion("RGN_ALL")}; +} +//Field3D operator/(const Field3D& lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator/(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = lhs.getRegionID(); + + //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; + int mesh_nz = lhs.getMesh()->LocalNz; + + return BinaryExpr{ + static_cast(lhs), + static_cast(rhs).setScale(1, mesh_nz), + 
bout::op::Div{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + lhs.getMesh()->getRegion("RGN_ALL")}; +} Field3D operator+(const Field3D& lhs, BoutReal rhs); Field3D operator-(const Field3D& lhs, BoutReal rhs); @@ -851,15 +933,17 @@ struct is_expr_field3d : std::true_type {}; template <> struct is_expr_field2d : std::true_type {}; -// Any nested BinaryExpr is an expression iff L is -//template -//struct is_expr_field3d> -// : std::true_type {}; - template struct is_expr_field3d> : std::integral_constant>::value> {}; +template +struct is_expr_field2d> + : std::integral_constant>::value> {}; + +//template +//struct is_expr_field3d::View> : is_expr_field3d {}; + //template //struct is_expr_field3d< typename BinaryExpr::View > // : std::integral_constant>::value> {}; diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index e9c87f242f..d83d1eedd1 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -1,4 +1,6 @@ #pragma once +#include "bout/array.hxx" +#include #ifndef BOUT_FIELDOPS_HXX #define BOUT_FIELDOPS_HXX @@ -14,44 +16,96 @@ class Field3D; namespace bout { namespace op { - struct Add { - template - __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { - return L(idx) + R(idx); +struct Assign { + int scale = 1; + int offset = 0; + template + __device__ void operator()(int idx, BoutReal* out, const Expr& expr) const { + out[(idx * scale) + offset] = expr.lhs(idx) + expr.rhs(idx); + } +}; + +struct Add { + template + __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) + R(idx); + } + __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + return a + b; + } +}; + struct Sub { + template + __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) - R(idx); + } + __device__ __forceinline__ BoutReal operator()(BoutReal a, 
BoutReal b) const { + return a - b; } - __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } }; - struct Sub { - template - __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) - R(idx); } - __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a - b; } - }; - struct Mul { - template - __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) * R(idx); } - __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a * b; } - }; - struct Div { - template - __host__ __device__ inline BoutReal operator()(int idx, const LView &L, const RView &R) const { return L(idx) / R(idx); } - __host__ __device__ inline BoutReal operator()(BoutReal a, BoutReal b) const { return a / b; } + struct Mul { + template + __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) * R(idx); + } + __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + return a * b; + } + }; + struct Div { + template + __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { + return L(idx) / R(idx); + } + __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + return a / b; + } }; }; }; template -__global__ static void evaluatorExpr(BoutReal* out, const Expr& expr) { +__global__ __launch_bounds__(256) static void evaluatorExpr(BoutReal* out, + const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = tid; i < expr.size(); i += stride) { - out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion + if (tid >= expr.size()) { + return; // Out of bounds } + int idx = expr.regionIdx(tid); + out[idx] = expr(idx); // single‐pass fusion + //int stride = blockDim.x * gridDim.x; + //for (int 
i = tid, e = expr.size(); i < e; i += stride) { + // int idx = expr.regionIdx(i); + // out[idx] = expr(idx); // single‐pass fusion + //} } +template +__global__ __launch_bounds__(256) static void evaluatorExprWithResult(Result res, + const Expr expr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= expr.size()) { + return; // Out of bounds + } + int idx = expr.regionIdx(tid); + res[idx] = expr(idx); // single‐pass fusion + //res(idx, expr(idx)); // single‐pass fusion + //res(idx) = expr(idx); // single‐pass fusion + //int stride = blockDim.x * gridDim.x; + //for (int i = tid, e = expr.size(); i < e; i += stride) { + // int idx = expr.regionIdx(i); + // out[idx] = expr(idx); // single‐pass fusion + //} +} + +inline std::unordered_map> regionIndicesCache; + template struct BinaryExpr { - const L &LHS; - const R &RHS; typename L::View lhs; typename R::View rhs; Array indices; @@ -63,16 +117,31 @@ struct BinaryExpr { std::optional regionID; template - BinaryExpr(const L &lhs, const R &rhs, Func f, Mesh* mesh, CELL_LOC location, - DirectionTypes directions, std::optional regionID, + BinaryExpr(const typename L::View& lhs, const typename R::View& rhs, Func f, Mesh* mesh, + CELL_LOC location, DirectionTypes directions, std::optional regionID, const Region& region) - : LHS(lhs), RHS(rhs), lhs(static_cast(lhs)), rhs(static_cast(rhs)), - f(f), mesh(mesh), location(location), directions(directions), regionID(regionID), - indices(region.getIndices().size()) { + //: lhs(static_cast(lhs)), rhs(static_cast(rhs)), + : lhs(lhs), rhs(rhs), f(f), mesh(mesh), location(location), directions(directions), + regionID(regionID), indices(region.getIndices().size()) { // Copy the region indices into the managed array for (int i = 0; i < indices.size(); ++i) { indices[i] = region.getIndices()[i].ind; } + //if (regionIndicesCache.find(static_cast(const_cast*>(®ion))) + // != regionIndicesCache.end()) { + // // If we have already computed the indices for this region, use them + 
// indices = + // regionIndicesCache[static_cast(const_cast*>(®ion))]; + //} else { + // // Otherwise, compute the indices and store them in the cache + // indices = Array(region.getIndices().size()); + // // Copy the region indices into the managed array + // for (int i = 0; i < indices.size(); ++i) { + // indices[i] = region.getIndices()[i].ind; + // } + // regionIndicesCache[static_cast(const_cast*>(®ion))] = + // indices; + //} } inline int size() const { return indices.size(); } @@ -87,10 +156,21 @@ struct BinaryExpr { const int* indices; int num_indices; Func f; + int scale = 1; + int offset = 0; - __device__ inline int size() const { return num_indices; } - __device__ inline int regionIdx(int idx) const { return indices[idx]; } - __device__ inline BoutReal operator()(int idx) const { + View& setScale(int s) { + scale = s; + return *this; + } + View& setOffset(int o) { + offset = o; + return *this; + } + + __device__ __forceinline__ int size() const { return num_indices; } + __device__ __forceinline__ int regionIdx(int idx) const { return indices[idx]; } + __device__ __forceinline__ BoutReal operator()(int idx) const { return f(idx, lhs, rhs); // single‐pass fusion //return f(lhs(idx), rhs(idx)); // single‐pass fusion } @@ -109,6 +189,17 @@ struct BinaryExpr { //} } + template + void evaluateWithResult(const Result& res) const { + constexpr int THREADS = 256; + int blocks = (size() + THREADS - 1) / THREADS; + evaluatorExprWithResult<<>>(res, static_cast(*this)); + cudaDeviceSynchronize(); + //for(int i=0; i #include -// Provide the C++ wrapper for multiplication of Field3D and Field3D -Field3D operator*(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs[index]; - } - - 
checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by multiplication with Field3D -#if 0 -Field3D& Field3D::operator*=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} -#endif - -// Provide the C++ wrapper for division of Field3D and Field3D -Field3D operator/(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] / rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by division with Field3D -#if 0 -Field3D& Field3D::operator/=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} -#endif - -// Provide the C++ wrapper for addition of Field3D and Field3D -Field3D operator+(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs[index]; - } - - checkData(result); - return result; -} - -#if 0 -// Provide the C++ operator to update Field3D by addition with Field3D -Field3D& Field3D::operator+=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} -#endif - -// Provide the C++ wrapper for subtraction of Field3D and Field3D -Field3D operator-(const Field3D& lhs, const Field3D& rhs) { - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID())); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs[index]; - } - - checkData(result); - return result; -} - -// Provide the C++ operator to update Field3D by subtraction with Field3D -#if 0 -Field3D& Field3D::operator-=(const Field3D& rhs) { - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - regionID = fieldmesh->getCommonRegion(regionID, rhs.regionID); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { - (*this)[index] -= rhs[index]; - printf("[golden] val[%d] %lf\n", index, (*this)[index]); - } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for multiplication of Field3D and Field2D +#if 0 Field3D operator*(const Field3D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -215,9 +30,11 @@ Field3D operator*(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by multiplication with Field2D Field3D& Field3D::operator*=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -245,8 +62,10 @@ Field3D& Field3D::operator*=(const Field2D& rhs) { return *this; } +#if 1 // Provide the C++ wrapper for division of Field3D and Field2D Field3D operator/(const Field3D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -268,9 +87,12 @@ Field3D operator/(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif +#if 0 // Provide the C++ operator to update Field3D by division with Field2D Field3D& Field3D::operator/=(const Field2D& rhs) { + std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -298,9 +120,11 @@ Field3D& 
Field3D::operator/=(const Field2D& rhs) { } return *this; } +#endif // Provide the C++ wrapper for addition of Field3D and Field2D Field3D operator+(const Field3D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -324,6 +148,7 @@ Field3D operator+(const Field3D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field3D by addition with Field2D Field3D& Field3D::operator+=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -353,6 +178,7 @@ Field3D& Field3D::operator+=(const Field2D& rhs) { // Provide the C++ wrapper for subtraction of Field3D and Field2D Field3D operator-(const Field3D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(lhs)}; @@ -376,6 +202,7 @@ Field3D operator-(const Field3D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field3D by subtraction with Field2D Field3D& Field3D::operator-=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -405,6 +232,7 @@ Field3D& Field3D::operator-=(const Field2D& rhs) { // Provide the C++ wrapper for multiplication of Field3D and FieldPerp FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -425,6 +253,7 @@ FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs) { // Provide the C++ 
wrapper for division of Field3D and FieldPerp FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -445,6 +274,7 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for addition of Field3D and FieldPerp FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -465,6 +295,7 @@ FieldPerp operator+(const Field3D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of Field3D and FieldPerp FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -485,6 +316,7 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for multiplication of Field3D and BoutReal Field3D operator*(const Field3D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(lhs)}; checkData(lhs); @@ -502,6 +334,7 @@ Field3D operator*(const Field3D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field3D by multiplication with BoutReal Field3D& Field3D::operator*=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -525,6 +358,7 @@ Field3D& Field3D::operator*=(const BoutReal rhs) { // Provide the C++ wrapper for division of Field3D and BoutReal Field3D operator/(const Field3D& lhs, const BoutReal rhs) { + std::cout 
<< "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(lhs)}; checkData(lhs); @@ -543,6 +377,7 @@ Field3D operator/(const Field3D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field3D by division with BoutReal Field3D& Field3D::operator/=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -567,6 +402,7 @@ Field3D& Field3D::operator/=(const BoutReal rhs) { // Provide the C++ wrapper for addition of Field3D and BoutReal Field3D operator+(const Field3D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(lhs)}; checkData(lhs); @@ -584,6 +420,7 @@ Field3D operator+(const Field3D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field3D by addition with BoutReal Field3D& Field3D::operator+=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -607,6 +444,7 @@ Field3D& Field3D::operator+=(const BoutReal rhs) { // Provide the C++ wrapper for subtraction of Field3D and BoutReal Field3D operator-(const Field3D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(lhs)}; checkData(lhs); @@ -624,6 +462,7 @@ Field3D operator-(const Field3D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field3D by subtraction with BoutReal Field3D& Field3D::operator-=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace 
version if (data.unique()) { @@ -647,6 +486,7 @@ Field3D& Field3D::operator-=(const BoutReal rhs) { // Provide the C++ wrapper for multiplication of Field2D and Field3D Field3D operator*(const Field2D& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(rhs)}; @@ -670,6 +510,7 @@ Field3D operator*(const Field2D& lhs, const Field3D& rhs) { // Provide the C++ wrapper for division of Field2D and Field3D Field3D operator/(const Field2D& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(rhs)}; @@ -693,6 +534,7 @@ Field3D operator/(const Field2D& lhs, const Field3D& rhs) { // Provide the C++ wrapper for addition of Field2D and Field3D Field3D operator+(const Field2D& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(rhs)}; @@ -716,6 +558,7 @@ Field3D operator+(const Field2D& lhs, const Field3D& rhs) { // Provide the C++ wrapper for subtraction of Field2D and Field3D Field3D operator-(const Field2D& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field3D result{emptyFrom(rhs)}; @@ -739,6 +582,7 @@ Field3D operator-(const Field2D& lhs, const Field3D& rhs) { // Provide the C++ wrapper for multiplication of Field2D and Field2D Field2D operator*(const Field2D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field2D result{emptyFrom(lhs)}; @@ -755,6 +599,7 @@ Field2D operator*(const Field2D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field2D by multiplication with 
Field2D Field2D& Field2D::operator*=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -775,6 +620,7 @@ Field2D& Field2D::operator*=(const Field2D& rhs) { // Provide the C++ wrapper for division of Field2D and Field2D Field2D operator/(const Field2D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field2D result{emptyFrom(lhs)}; @@ -791,6 +637,7 @@ Field2D operator/(const Field2D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field2D by division with Field2D Field2D& Field2D::operator/=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -811,6 +658,7 @@ Field2D& Field2D::operator/=(const Field2D& rhs) { // Provide the C++ wrapper for addition of Field2D and Field2D Field2D operator+(const Field2D& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field2D result{emptyFrom(lhs)}; @@ -827,6 +675,7 @@ Field2D operator+(const Field2D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field2D by addition with Field2D Field2D& Field2D::operator+=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -847,6 +696,7 @@ Field2D& Field2D::operator+=(const Field2D& rhs) { // Provide the C++ wrapper for subtraction of Field2D and Field2D Field2D operator-(const Field2D& lhs, const Field2D& rhs) { + std::cout << 
"RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); Field2D result{emptyFrom(lhs)}; @@ -863,6 +713,7 @@ Field2D operator-(const Field2D& lhs, const Field2D& rhs) { // Provide the C++ operator to update Field2D by subtraction with Field2D Field2D& Field2D::operator-=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -883,6 +734,7 @@ Field2D& Field2D::operator-=(const Field2D& rhs) { // Provide the C++ wrapper for multiplication of Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -903,6 +755,7 @@ FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for division of Field2D and FieldPerp FieldPerp operator/(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -923,6 +776,7 @@ FieldPerp operator/(const Field2D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for addition of Field2D and FieldPerp FieldPerp operator+(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -943,6 +797,7 @@ FieldPerp operator+(const Field2D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of Field2D and FieldPerp FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; 
ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(rhs)}; @@ -963,6 +818,7 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for multiplication of Field2D and BoutReal Field2D operator*(const Field2D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(lhs)}; checkData(lhs); @@ -978,6 +834,7 @@ Field2D operator*(const Field2D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field2D by multiplication with BoutReal Field2D& Field2D::operator*=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -997,6 +854,7 @@ Field2D& Field2D::operator*=(const BoutReal rhs) { // Provide the C++ wrapper for division of Field2D and BoutReal Field2D operator/(const Field2D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(lhs)}; checkData(lhs); @@ -1013,6 +871,7 @@ Field2D operator/(const Field2D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field2D by division with BoutReal Field2D& Field2D::operator/=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1033,6 +892,7 @@ Field2D& Field2D::operator/=(const BoutReal rhs) { // Provide the C++ wrapper for addition of Field2D and BoutReal Field2D operator+(const Field2D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(lhs)}; checkData(lhs); @@ -1048,6 +908,7 @@ Field2D operator+(const Field2D& lhs, const BoutReal rhs) { // 
Provide the C++ operator to update Field2D by addition with BoutReal Field2D& Field2D::operator+=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1067,6 +928,7 @@ Field2D& Field2D::operator+=(const BoutReal rhs) { // Provide the C++ wrapper for subtraction of Field2D and BoutReal Field2D operator-(const Field2D& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(lhs)}; checkData(lhs); @@ -1082,6 +944,7 @@ Field2D operator-(const Field2D& lhs, const BoutReal rhs) { // Provide the C++ operator to update Field2D by subtraction with BoutReal Field2D& Field2D::operator-=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1101,6 +964,7 @@ Field2D& Field2D::operator-=(const BoutReal rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and Field3D FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1121,6 +985,7 @@ FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with Field3D FieldPerp& FieldPerp::operator*=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1147,6 +1012,7 @@ FieldPerp& FieldPerp::operator*=(const Field3D& rhs) { // Provide the C++ wrapper for division of FieldPerp 
and Field3D FieldPerp operator/(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1167,6 +1033,7 @@ FieldPerp operator/(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by division with Field3D FieldPerp& FieldPerp::operator/=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1193,6 +1060,7 @@ FieldPerp& FieldPerp::operator/=(const Field3D& rhs) { // Provide the C++ wrapper for addition of FieldPerp and Field3D FieldPerp operator+(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1213,6 +1081,7 @@ FieldPerp operator+(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by addition with Field3D FieldPerp& FieldPerp::operator+=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1239,6 +1108,7 @@ FieldPerp& FieldPerp::operator+=(const Field3D& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and Field3D FieldPerp operator-(const FieldPerp& lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1259,6 +1129,7 @@ FieldPerp operator-(const FieldPerp& lhs, const Field3D& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with Field3D FieldPerp& 
FieldPerp::operator-=(const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1285,6 +1156,7 @@ FieldPerp& FieldPerp::operator-=(const Field3D& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and Field2D FieldPerp operator*(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1305,6 +1177,7 @@ FieldPerp operator*(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with Field2D FieldPerp& FieldPerp::operator*=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1331,6 +1204,7 @@ FieldPerp& FieldPerp::operator*=(const Field2D& rhs) { // Provide the C++ wrapper for division of FieldPerp and Field2D FieldPerp operator/(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1351,6 +1225,7 @@ FieldPerp operator/(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by division with Field2D FieldPerp& FieldPerp::operator/=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1377,6 +1252,7 @@ FieldPerp& FieldPerp::operator/=(const Field2D& rhs) { // Provide the C++ wrapper for addition of FieldPerp and Field2D FieldPerp 
operator+(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1397,6 +1273,7 @@ FieldPerp operator+(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by addition with Field2D FieldPerp& FieldPerp::operator+=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1423,6 +1300,7 @@ FieldPerp& FieldPerp::operator+=(const Field2D& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and Field2D FieldPerp operator-(const FieldPerp& lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1443,6 +1321,7 @@ FieldPerp operator-(const FieldPerp& lhs, const Field2D& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with Field2D FieldPerp& FieldPerp::operator-=(const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1469,6 +1348,7 @@ FieldPerp& FieldPerp::operator-=(const Field2D& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and FieldPerp FieldPerp operator*(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1485,6 +1365,7 @@ FieldPerp operator*(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by multiplication with FieldPerp FieldPerp& 
FieldPerp::operator*=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1505,6 +1386,7 @@ FieldPerp& FieldPerp::operator*=(const FieldPerp& rhs) { // Provide the C++ wrapper for division of FieldPerp and FieldPerp FieldPerp operator/(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1521,6 +1403,7 @@ FieldPerp operator/(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by division with FieldPerp FieldPerp& FieldPerp::operator/=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1541,6 +1424,7 @@ FieldPerp& FieldPerp::operator/=(const FieldPerp& rhs) { // Provide the C++ wrapper for addition of FieldPerp and FieldPerp FieldPerp operator+(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1557,6 +1441,7 @@ FieldPerp operator+(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by addition with FieldPerp FieldPerp& FieldPerp::operator+=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1577,6 +1462,7 @@ FieldPerp& FieldPerp::operator+=(const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and FieldPerp 
FieldPerp operator-(const FieldPerp& lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); FieldPerp result{emptyFrom(lhs)}; @@ -1593,6 +1479,7 @@ FieldPerp operator-(const FieldPerp& lhs, const FieldPerp& rhs) { // Provide the C++ operator to update FieldPerp by subtraction with FieldPerp FieldPerp& FieldPerp::operator-=(const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1613,6 +1500,7 @@ FieldPerp& FieldPerp::operator-=(const FieldPerp& rhs) { // Provide the C++ wrapper for multiplication of FieldPerp and BoutReal FieldPerp operator*(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1628,6 +1516,7 @@ FieldPerp operator*(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by multiplication with BoutReal FieldPerp& FieldPerp::operator*=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1647,6 +1536,7 @@ FieldPerp& FieldPerp::operator*=(const BoutReal rhs) { // Provide the C++ wrapper for division of FieldPerp and BoutReal FieldPerp operator/(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1663,6 +1553,7 @@ FieldPerp operator/(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by division with BoutReal FieldPerp& FieldPerp::operator/=(const BoutReal rhs) 
{ + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1682,6 +1573,7 @@ FieldPerp& FieldPerp::operator/=(const BoutReal rhs) { // Provide the C++ wrapper for addition of FieldPerp and BoutReal FieldPerp operator+(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1697,6 +1589,7 @@ FieldPerp operator+(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by addition with BoutReal FieldPerp& FieldPerp::operator+=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1716,6 +1609,7 @@ FieldPerp& FieldPerp::operator+=(const BoutReal rhs) { // Provide the C++ wrapper for subtraction of FieldPerp and BoutReal FieldPerp operator-(const FieldPerp& lhs, const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(lhs)}; checkData(lhs); @@ -1731,6 +1625,7 @@ FieldPerp operator-(const FieldPerp& lhs, const BoutReal rhs) { // Provide the C++ operator to update FieldPerp by subtraction with BoutReal FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field // otherwise just call the non-inplace version if (data.unique()) { @@ -1750,6 +1645,7 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { // Provide the C++ wrapper for multiplication of BoutReal and Field3D Field3D operator*(const BoutReal lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << 
__FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(rhs)}; checkData(lhs); @@ -1767,6 +1663,7 @@ Field3D operator*(const BoutReal lhs, const Field3D& rhs) { // Provide the C++ wrapper for division of BoutReal and Field3D Field3D operator/(const BoutReal lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(rhs)}; checkData(lhs); @@ -1784,6 +1681,7 @@ Field3D operator/(const BoutReal lhs, const Field3D& rhs) { // Provide the C++ wrapper for addition of BoutReal and Field3D Field3D operator+(const BoutReal lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(rhs)}; checkData(lhs); @@ -1801,6 +1699,7 @@ Field3D operator+(const BoutReal lhs, const Field3D& rhs) { // Provide the C++ wrapper for subtraction of BoutReal and Field3D Field3D operator-(const BoutReal lhs, const Field3D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field3D result{emptyFrom(rhs)}; checkData(lhs); @@ -1818,6 +1717,7 @@ Field3D operator-(const BoutReal lhs, const Field3D& rhs) { // Provide the C++ wrapper for multiplication of BoutReal and Field2D Field2D operator*(const BoutReal lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(rhs)}; checkData(lhs); @@ -1833,6 +1733,7 @@ Field2D operator*(const BoutReal lhs, const Field2D& rhs) { // Provide the C++ wrapper for division of BoutReal and Field2D Field2D operator/(const BoutReal lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(rhs)}; checkData(lhs); @@ -1848,6 +1749,7 @@ Field2D operator/(const BoutReal lhs, const Field2D& rhs) { // Provide the C++ wrapper for addition of BoutReal and Field2D Field2D 
operator+(const BoutReal lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(rhs)}; checkData(lhs); @@ -1863,6 +1765,7 @@ Field2D operator+(const BoutReal lhs, const Field2D& rhs) { // Provide the C++ wrapper for subtraction of BoutReal and Field2D Field2D operator-(const BoutReal lhs, const Field2D& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; Field2D result{emptyFrom(rhs)}; checkData(lhs); @@ -1878,6 +1781,7 @@ Field2D operator-(const BoutReal lhs, const Field2D& rhs) { // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1893,6 +1797,7 @@ FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for division of BoutReal and FieldPerp FieldPerp operator/(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1908,6 +1813,7 @@ FieldPerp operator/(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for addition of BoutReal and FieldPerp FieldPerp operator+(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); @@ -1923,6 +1829,7 @@ FieldPerp operator+(const BoutReal lhs, const FieldPerp& rhs) { // Provide the C++ wrapper for subtraction of BoutReal and FieldPerp FieldPerp operator-(const BoutReal lhs, const FieldPerp& rhs) { + std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; FieldPerp result{emptyFrom(rhs)}; checkData(lhs); From 
4d64ad2bbbd7bb1f717cf27b91cb85105dd6723d Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 30 May 2025 17:37:58 -0700 Subject: [PATCH 11/29] More operators - Working version --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 5 +- include/bout/assert.hxx | 1 + include/bout/bout_types.hxx | 11 ++ include/bout/field2d.hxx | 114 ++++++++++++++++-- include/bout/field3d.hxx | 53 ++++---- include/bout/fieldops.hxx | 47 ++++++-- include/bout/interpolation.hxx | 16 ++- include/bout/utils.hxx | 4 +- src/field/field2d.cxx | 3 +- src/field/generated_fieldops.cxx | 14 ++- src/field/vecops.cxx | 2 +- .../laplace/impls/naulin/naulin_laplace.cxx | 3 +- src/mesh/coordinates.cxx | 32 ++--- src/sys/derivs.cxx | 4 +- 14 files changed, 235 insertions(+), 74 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 8e84901806..d985c3ef9d 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1793,7 +1793,7 @@ class ELMpb : public PhysicsModel { // Calculate coefficient. 
hyper_mu_x = hyperviscos * metric->g_11 * SQ(metric->dx) - * abs(metric->g11 * D2DX2(U)) / (abs(U) + 1e-3); + * abs(Field3D{metric->g11 * D2DX2(U)}) / (abs(U) + 1e-3); hyper_mu_x.applyBoundary("dirichlet"); // Set to zero on all boundaries ddt(U) += hyper_mu_x * metric->g11 * D2DX2(U); @@ -1840,7 +1840,8 @@ class ELMpb : public PhysicsModel { ddt(U) -= 0.5 * Upara2 * bracket(Pi0, Dperp2Phi, bm_exb) / B0; Field3D B0phi = B0 * phi; mesh->communicate(B0phi); - Field3D B0phi0 = B0 * phi0; + Field2D res = B0 * phi0; + Field3D B0phi0 = res; mesh->communicate(B0phi0); ddt(U) += 0.5 * Upara2 * bracket(B0phi, Dperp2Pi0, bm_exb) / B0; ddt(U) += 0.5 * Upara2 * bracket(B0phi0, Dperp2Pi, bm_exb) / B0; diff --git a/include/bout/assert.hxx b/include/bout/assert.hxx index 653c44ed42..2909cfc3c3 100644 --- a/include/bout/assert.hxx +++ b/include/bout/assert.hxx @@ -38,6 +38,7 @@ #if CHECKLEVEL >= 1 #define ASSERT1(condition) \ if (!(condition)) { \ + abort(); \ throw BoutException("Assertion failed in {:s}, line {:d}: {:s}", __FILE__, __LINE__, \ #condition); \ } diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index c1f06fca7c..b2f38b61aa 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -140,4 +140,15 @@ struct enumWrapper { /// Boundary condition function using FuncPtr = BoutReal (*)(BoutReal t, BoutReal x, BoutReal y, BoutReal z); +template +struct Constant { + T val; + struct View { + T v; + View(T v) : v(v) {} + __device__ T operator()(int) const { return v; } + }; + operator View() const { return {val}; } +}; + #endif // BOUT_TYPES_H diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index db2ce194f8..9d9948296e 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -38,6 +38,8 @@ class Field2D; #include "bout/region.hxx" #include "bout/unused.hxx" +#include "bout/fieldops.hxx" + #if BOUT_HAS_RAJA #include "RAJA/RAJA.hpp" // using RAJA lib #endif @@ -45,6 +47,16 @@ class Field2D; class Field3D; 
class Mesh; +//template +//struct is_expr_field2d : std::false_type {}; + +//template +//inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; + +template +struct is_expr_field2d> + : std::integral_constant>::value && is_expr_field2d_v>> {}; + /*! * \brief 2D X-Y scalar fields * @@ -91,6 +103,14 @@ public: DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Average}); + template && is_expr_field2d_v>> + Field2D(const BinaryExpr& expr) { + Array data{expr.size()}; + expr.evaluate(&data[0]); + *this = std::move(Field2D{std::move(data), expr.getMesh(), expr.getLocation(), + expr.getDirections()}); + } /*! * Destructor */ @@ -166,6 +186,14 @@ public: */ Field2D& operator=(BoutReal rhs); + template + std::enable_if_t, Field2D&> + operator=(BinaryExpr& expr) { + std::cout << "RUNNING Field2D operator= with CUDA\n"; + expr.evaluate(&data[0]); + return *this; + } + ///////////////////////////////////////////////////////// // Data access @@ -310,30 +338,90 @@ private: // Non-member overloaded operators -Field2D operator+(const Field2D& lhs, const Field2D& rhs); -Field2D operator-(const Field2D& lhs, const Field2D& rhs); -Field2D operator*(const Field2D& lhs, const Field2D& rhs); -#if 0 -template && is_expr_field2d_v>> -BinaryExpr operator*(const L& lhs, const R& rhs) { +//Field2D operator+(const Field2D& lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator+(const L& lhs, const R& rhs) { + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Add{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +//Field2D operator-(const Field2D& lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator-(const L& lhs, const R& rhs) { + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Sub{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + 
std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +//Field2D operator*(const Field2D& lhs, const Field2D& rhs); +#if 1 +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator*(const L& lhs, const R& rhs) { return BinaryExpr{static_cast(lhs), static_cast(rhs), bout::op::Mul{}, lhs.getMesh(), lhs.getLocation(), lhs.getDirections(), - lhs.getRegionID(), - (regionID.has_value() - ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +#endif +//Field2D operator/(const Field2D& lhs, const Field2D& rhs); +#if 1 +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr> +operator/(const L& lhs, const R& rhs) { + return BinaryExpr{static_cast(lhs), + static_cast(rhs), + bout::op::Div{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; } #endif -Field2D operator/(const Field2D& lhs, const Field2D& rhs); Field3D operator+(const Field2D& lhs, const Field3D& rhs); Field3D operator-(const Field2D& lhs, const Field3D& rhs); -Field3D operator*(const Field2D& lhs, const Field3D& rhs); +//Field3D operator*(const Field2D& lhs, const Field3D& rhs); +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr> +operator*(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = rhs.getRegionID(); + + //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; + int mesh_nz = rhs.getMesh()->LocalNz; + + return BinaryExpr{ + static_cast(lhs).setScale(1, mesh_nz), + static_cast(rhs), + bout::op::Mul{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + regionID, + rhs.getMesh()->getRegion("RGN_ALL")}; +} Field3D operator/(const Field2D& lhs, const Field3D& rhs); Field2D operator+(const Field2D& lhs, BoutReal rhs); diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 33e42cccbb..80956cd6f9 100644 
--- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -40,19 +40,6 @@ class Field3D; class Mesh; #include "bout/fieldops.hxx" -// Base template: nothing is an expression by default -template -struct is_expr_field3d : std::false_type {}; - -template -struct is_expr_field2d : std::false_type {}; - -// Helper variable template -template -inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; - -template -inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; /// Class for 3D X-Y-Z scalar fields /*! @@ -198,7 +185,8 @@ public: Field3D(Array data, Mesh* localmesh, CELL_LOC location = CELL_CENTRE, DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); - template + template || is_expr_field3d_v>> Field3D(const BinaryExpr& expr) { //std::cout << "RUNNING constructor from BinaryExpr\n"; Array data{expr.size()}; @@ -475,7 +463,8 @@ public: void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); template - Field3D& operator=(BinaryExpr& expr) { + std::enable_if_t, Field3D&> + operator=(BinaryExpr& expr) { std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); //expr.evaluate(&data[0]); @@ -811,7 +800,25 @@ Field3D operator/(const Field3D& lhs, BoutReal rhs); Field3D operator+(BoutReal lhs, const Field3D& rhs); Field3D operator-(BoutReal lhs, const Field3D& rhs); -Field3D operator*(BoutReal lhs, const Field3D& rhs); +//Field3D operator*(BoutReal lhs, const Field3D& rhs); +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr, R, bout::op::Mul>> +operator*(const L& lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = rhs.getRegionID(); + + return BinaryExpr, R, bout::op::Mul>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Mul{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + regionID, + rhs.getMesh()->getRegion("RGN_ALL")}; +} + Field3D operator/(BoutReal lhs, const Field3D& rhs); /*! 
@@ -935,18 +942,6 @@ struct is_expr_field2d : std::true_type {}; template struct is_expr_field3d> - : std::integral_constant>::value> {}; - -template -struct is_expr_field2d> - : std::integral_constant>::value> {}; - -//template -//struct is_expr_field3d::View> : is_expr_field3d {}; - -//template -//struct is_expr_field3d< typename BinaryExpr::View > -// : std::integral_constant>::value> {}; -// //: is_expr_field3d> {}; + : std::integral_constant>::value || is_expr_field3d_v>> {}; #endif /* BOUT_FIELD3D_H */ diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index d83d1eedd1..b78dadc315 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -1,18 +1,46 @@ #pragma once -#include "bout/array.hxx" -#include #ifndef BOUT_FIELDOPS_HXX #define BOUT_FIELDOPS_HXX +#include "bout/array.hxx" #include "bout/bout_types.hxx" #include #include +#include +#include class Mesh; class Field3D; +class Field2D; + +template +struct is_expr_field2d : std::false_type {}; + +template +inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; + +// Base template: nothing is an expression by default +template +struct is_expr_field3d : std::false_type {}; + +// Helper variable template +template +inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; + +template +struct is_expr_boutreal : std::false_type {}; + +template +inline constexpr bool is_expr_boutreal_v = is_expr_boutreal>::value; + +template <> +struct is_expr_boutreal : std::true_type {}; + +template +struct is_expr_boutreal> + : std::integral_constant>> {}; -#include namespace bout { namespace op { @@ -144,6 +172,9 @@ struct BinaryExpr { //} } + BinaryExpr& operator=(BinaryExpr const&) = delete; + BinaryExpr& operator=(BinaryExpr&&) = delete; + inline int size() const { return indices.size(); } inline BoutReal operator()(int idx) const { return f(idx, lhs, rhs); // single‐pass fusion @@ -156,11 +187,13 @@ struct BinaryExpr { const int* indices; int num_indices; Func f; - int 
scale = 1; + int mul = 1; + int div = 1; int offset = 0; - View& setScale(int s) { - scale = s; + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; return *this; } View& setOffset(int o) { @@ -171,7 +204,7 @@ struct BinaryExpr { __device__ __forceinline__ int size() const { return num_indices; } __device__ __forceinline__ int regionIdx(int idx) const { return indices[idx]; } __device__ __forceinline__ BoutReal operator()(int idx) const { - return f(idx, lhs, rhs); // single‐pass fusion + return f((idx * mul) / div, lhs, rhs); // single‐pass fusion //return f(lhs(idx), rhs(idx)); // single‐pass fusion } }; diff --git a/include/bout/interpolation.hxx b/include/bout/interpolation.hxx index 2c7df4472d..85c04cf897 100644 --- a/include/bout/interpolation.hxx +++ b/include/bout/interpolation.hxx @@ -55,7 +55,8 @@ inline BoutReal interp(const stencil& s) { @param[in] region Region where output will be calculated */ template -const T interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { +std::enable_if_t || bout::utils::is_Field3D_v, const T> +interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_ALL") { AUTO_TRACE(); static_assert(bout::utils::is_Field2D_v || bout::utils::is_Field3D_v, "interp_to must be templated with one of Field2D or Field3D."); @@ -203,4 +204,17 @@ const T interp_to(const T& var, CELL_LOC loc, const std::string region = "RGN_AL return result; } + +template +std::enable_if_t && !bout::utils::is_Field3D_v, const Field3D> +interp_to(const E &expr, CELL_LOC loc, const std::string rgn = "RGN_ALL") { + return interp_to( Field3D{expr}, loc, std::move(rgn) ); +} + +template +std::enable_if_t && !bout::utils::is_Field2D_v, const Field2D> +interp_to(const E &expr, CELL_LOC loc, const std::string rgn = "RGN_ALL") { + return interp_to( Field2D{expr}, loc, std::move(rgn) ); +} + #endif // BOUT_INTERP_H diff --git a/include/bout/utils.hxx b/include/bout/utils.hxx index e2ac814e53..42aa761886 100644 --- 
a/include/bout/utils.hxx +++ b/include/bout/utils.hxx @@ -422,12 +422,12 @@ inline BoutReal randomu() { * i.e. t * t */ template -inline T SQ(const T& t) { +inline auto SQ(const T& t) { return t * t; } template <> -BOUT_HOST_DEVICE inline BoutReal SQ(const BoutReal& t) { +BOUT_HOST_DEVICE inline auto SQ(const BoutReal& t) { return t * t; } diff --git a/src/field/field2d.cxx b/src/field/field2d.cxx index c8b9ebb689..e5c1d466b7 100644 --- a/src/field/field2d.cxx +++ b/src/field/field2d.cxx @@ -389,7 +389,8 @@ bool operator==(const Field2D& a, const Field2D& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return min(abs(a - b)) < 1e-10; + Field2D diff = a - b; + return min(abs(diff)) < 1e-10; } std::ostream& operator<<(std::ostream& out, const Field2D& value) { diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 691e450b0c..439320b0ae 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -62,7 +62,7 @@ Field3D& Field3D::operator*=(const Field2D& rhs) { return *this; } -#if 1 +#if 0 // Provide the C++ wrapper for division of Field3D and Field2D Field3D operator/(const Field3D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -485,6 +485,7 @@ Field3D& Field3D::operator-=(const BoutReal rhs) { } // Provide the C++ wrapper for multiplication of Field2D and Field3D +#if 0 Field3D operator*(const Field2D& lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -507,6 +508,7 @@ Field3D operator*(const Field2D& lhs, const Field3D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for division of Field2D and Field3D Field3D operator/(const Field2D& lhs, const Field3D& rhs) { @@ -581,6 +583,7 @@ Field3D operator-(const Field2D& lhs, const Field3D& rhs) { } // Provide the C++ wrapper for 
multiplication of Field2D and Field2D +#if 0 Field2D operator*(const Field2D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -596,6 +599,7 @@ Field2D operator*(const Field2D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by multiplication with Field2D Field2D& Field2D::operator*=(const Field2D& rhs) { @@ -618,6 +622,7 @@ Field2D& Field2D::operator*=(const Field2D& rhs) { return *this; } +#if 0 // Provide the C++ wrapper for division of Field2D and Field2D Field2D operator/(const Field2D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -634,6 +639,7 @@ Field2D operator/(const Field2D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by division with Field2D Field2D& Field2D::operator/=(const Field2D& rhs) { @@ -657,6 +663,7 @@ Field2D& Field2D::operator/=(const Field2D& rhs) { } // Provide the C++ wrapper for addition of Field2D and Field2D +#if 0 Field2D operator+(const Field2D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -672,6 +679,7 @@ Field2D operator+(const Field2D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by addition with Field2D Field2D& Field2D::operator+=(const Field2D& rhs) { @@ -695,6 +703,7 @@ Field2D& Field2D::operator+=(const Field2D& rhs) { } // Provide the C++ wrapper for subtraction of Field2D and Field2D +#if 0 Field2D operator-(const Field2D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); @@ -710,6 +719,7 @@ Field2D operator-(const Field2D& lhs, 
const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by subtraction with Field2D Field2D& Field2D::operator-=(const Field2D& rhs) { @@ -1643,6 +1653,7 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } +#if 0 // Provide the C++ wrapper for multiplication of BoutReal and Field3D Field3D operator*(const BoutReal lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1660,6 +1671,7 @@ Field3D operator*(const BoutReal lhs, const Field3D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for division of BoutReal and Field3D Field3D operator/(const BoutReal lhs, const Field3D& rhs) { diff --git a/src/field/vecops.cxx b/src/field/vecops.cxx index 5f34e2af02..95409963f6 100644 --- a/src/field/vecops.cxx +++ b/src/field/vecops.cxx @@ -187,7 +187,7 @@ Field3D Div(const Vector3D& v, CELL_LOC outloc, const std::string& method) { Vector3D vcn = v; vcn.toContravariant(); - auto vcnJy = vcn.y.getCoordinates()->J * vcn.y; + Field3D vcnJy = vcn.y.getCoordinates()->J * vcn.y; if (v.y.hasParallelSlices()) { // If v.y has parallel slices then we are using ShiftedMetric (with // mesh:calcParallelSlices_on_communicate=true) or FCI, so we should calculate diff --git a/src/invert/laplace/impls/naulin/naulin_laplace.cxx b/src/invert/laplace/impls/naulin/naulin_laplace.cxx index e6f68d850d..203b0c0abd 100644 --- a/src/invert/laplace/impls/naulin/naulin_laplace.cxx +++ b/src/invert/laplace/impls/naulin/naulin_laplace.cxx @@ -269,8 +269,9 @@ Field3D LaplaceNaulin::solve(const Field3D& rhs, const Field3D& x0) { delp2solver->setCoefC2(C2coef_DC); // Use this below to normalize error for relative error estimate + Field3D SQField = SQ(rhsOverD); BoutReal RMS_rhsOverD = sqrt(mean( - SQ(rhsOverD), true, + SQField, true, "RGN_NOBNDRY")); // use sqrt(mean(SQ)) to make sure we do not divide by zero at a point BoutReal error_rel = 
1e20, error_abs = 1e20, last_error = error_abs; diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 8123720144..9861fe58bf 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -553,7 +553,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) transform.get()); // Compare calculated and loaded values - output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(J - Jcalc))); + Field2D diff = J - Jcalc; + output_warn.write("\tMaximum difference in J is {:e}\n", max(abs(diff))); communicate(J); @@ -578,7 +579,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options) Bxy = interpolateAndExtrapolate(Bxy, location, extrapolate_x, extrapolate_y, false, transform.get()); - output_warn.write("\tMaximum difference in Bxy is {:e}\n", max(abs(Bxy - Bcalc))); + FieldMetric diff = Bxy - Bcalc; + output_warn.write("\tMaximum difference in Bxy is {:e}\n", max(abs(diff))); } // Check Bxy @@ -759,8 +761,9 @@ Coordinates::Coordinates(Mesh* mesh, Options* options, const CELL_LOC loc, J = interpolateAndExtrapolate(J, location, extrapolate_x, extrapolate_y, false, transform.get()); + FieldMetric diff = J - Jcalc; // Compare calculated and loaded values - output_warn.write("\tMaximum difference in J is %e\n", max(abs(J - Jcalc))); + output_warn.write("\tMaximum difference in J is %e\n", max(abs(diff))); // Re-evaluate Bxy using new J Bxy = sqrt(g_22) / J; @@ -785,7 +788,8 @@ Coordinates::Coordinates(Mesh* mesh, Options* options, const CELL_LOC loc, Bxy = interpolateAndExtrapolate(Bxy, location, extrapolate_x, extrapolate_y, false, transform.get()); - output_warn.write("\tMaximum difference in Bxy is %e\n", max(abs(Bxy - Bcalc))); + FieldMetric diff = Bxy - Bcalc; + output_warn.write("\tMaximum difference in Bxy is %e\n", max(abs(diff))); } // Check Bxy @@ -1029,7 +1033,7 @@ int Coordinates::geometry(bool recalculate_staggered, G3_23 = 0.5 * g13 * (DDZ(g_12) + DDY(g_13) - DDX(g_23)) + 0.5 * g23 * DDZ(g_22) + 0.5 * g33 * DDY(g_33); - auto 
tmp = J * g12; + FieldMetric tmp = J * g12; communicate(tmp); G1 = (DDX(J * g11) + DDY(tmp) + DDZ(J * g13)) / J; tmp = J * g22; @@ -1268,9 +1272,9 @@ int Coordinates::calcCovariant(const std::string& region) { output_info.write("\tLocal maximum error in diagonal inversion is {:e}\n", maxerr); - maxerr = BOUTMAX(max(abs(g_11 * g12 + g_12 * g22 + g_13 * g23)), - max(abs(g_11 * g13 + g_12 * g23 + g_13 * g33)), - max(abs(g_12 * g13 + g_22 * g23 + g_23 * g33))); + maxerr = BOUTMAX(max(abs(FieldMetric{g_11 * g12 + g_12 * g22 + g_13 * g23})), + max(abs(FieldMetric{g_11 * g13 + g_12 * g23 + g_13 * g33})), + max(abs(FieldMetric{g_12 * g13 + g_22 * g23 + g_23 * g33}))); output_info.write("\tLocal maximum error in off-diagonal inversion is {:e}\n", maxerr); @@ -1324,9 +1328,9 @@ int Coordinates::calcContravariant(const std::string& region) { output_info.write("\tMaximum error in diagonal inversion is {:e}\n", maxerr); - maxerr = BOUTMAX(max(abs(g_11 * g12 + g_12 * g22 + g_13 * g23)), - max(abs(g_11 * g13 + g_12 * g23 + g_13 * g33)), - max(abs(g_12 * g13 + g_22 * g23 + g_23 * g33))); + maxerr = BOUTMAX(max(abs(FieldMetric{g_11 * g12 + g_12 * g22 + g_13 * g23})), + max(abs(FieldMetric{g_11 * g13 + g_12 * g23 + g_13 * g33})), + max(abs(FieldMetric{g_12 * g13 + g_22 * g23 + g_23 * g33}))); output_info.write("\tMaximum error in off-diagonal inversion is {:e}\n", maxerr); return 0; @@ -1339,13 +1343,13 @@ int Coordinates::jacobian() { const bool extrapolate_x = not localmesh->sourceHasXBoundaryGuards(); const bool extrapolate_y = not localmesh->sourceHasYBoundaryGuards(); - auto g = g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 - g22 * g13 * g13 - - g33 * g12 * g12; + auto g = FieldMetric{g11 * g22 * g33 + 2.0 * g12 * g13 * g23 - g11 * g23 * g23 + - g22 * g13 * g13 - g33 * g12 * g12}; // Check that g is positive bout::checkPositive(g, "The determinant of g^ij", "RGN_NOBNDRY"); - J = 1. / sqrt(g); + J = 1. 
/ sqrt(Field2D{g}); // More robust to extrapolate derived quantities directly, rather than // deriving from extrapolated covariant metric components J = interpolateAndExtrapolate(J, location, extrapolate_x, extrapolate_y, false, diff --git a/src/sys/derivs.cxx b/src/sys/derivs.cxx index ee9bcbcc2c..f12c517f82 100644 --- a/src/sys/derivs.cxx +++ b/src/sys/derivs.cxx @@ -173,7 +173,7 @@ Coordinates::FieldMetric D2DX2(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { Coordinates* coords = f.getCoordinates(outloc); - auto result = + Field2D result = bout::derivatives::index::D2DX2(f, outloc, method, region) / SQ(coords->dx); if (coords->non_uniform) { @@ -210,7 +210,7 @@ Coordinates::FieldMetric D2DY2(const Field2D& f, CELL_LOC outloc, const std::string& method, const std::string& region) { Coordinates* coords = f.getCoordinates(outloc); - auto result = + Field2D result = bout::derivatives::index::D2DY2(f, outloc, method, region) / SQ(coords->dy); if (coords->non_uniform) { // Correction for non-uniform f.getMesh() From 5dfa66a75cba94f47aade6c6c63ca29afc0eb099 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sat, 31 May 2025 08:13:20 -0700 Subject: [PATCH 12/29] Add more operators --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 4 +- include/bout/assert.hxx | 2 +- include/bout/field2d.hxx | 80 +++++++++++++++---- include/bout/field3d.hxx | 16 ++-- include/bout/fieldops.hxx | 49 +++--------- src/field/generated_fieldops.cxx | 6 ++ src/mesh/coordinates.cxx | 12 +-- 7 files changed, 102 insertions(+), 67 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index d985c3ef9d..ec2ab8e2a7 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -713,7 +713,7 @@ class ELMpb : public PhysicsModel { diamag_phi0 = false; K_H_term = false; } else { - Dphi0 = -D_min - 0.5 * D_0 * (1.0 - tanh(D_s * (x - 
x0))); + Dphi0 = -D_min - 0.5 * D_0 * (1.0 - tanh(Field2D{D_s * (x - x0)})); } if (sign < 0) { // change flow direction @@ -1213,7 +1213,7 @@ class ELMpb : public PhysicsModel { // Only if not restarting: Check initial perturbation // Set U to zero where P0 < vacuum_pressure - U = where(P0 - vacuum_pressure, U, 0.0); + U = where(Field2D{P0 - vacuum_pressure}, U, 0.0); if (constn0) { ubyn = U; diff --git a/include/bout/assert.hxx b/include/bout/assert.hxx index 2909cfc3c3..954ae8dba0 100644 --- a/include/bout/assert.hxx +++ b/include/bout/assert.hxx @@ -38,9 +38,9 @@ #if CHECKLEVEL >= 1 #define ASSERT1(condition) \ if (!(condition)) { \ - abort(); \ throw BoutException("Assertion failed in {:s}, line {:d}: {:s}", __FILE__, __LINE__, \ #condition); \ + abort(); \ } #else // CHECKLEVEL >= 1 #define ASSERT1(condition) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 9d9948296e..baff508331 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -47,15 +47,14 @@ class Field2D; class Field3D; class Mesh; -//template -//struct is_expr_field2d : std::false_type {}; - -//template -//inline constexpr bool is_expr_field2d_v = is_expr_field2d>::value; - template struct is_expr_field2d> - : std::integral_constant>::value && is_expr_field2d_v>> {}; + : std::integral_constant> + && is_expr_field2d_v>) + || (is_expr_constant_v> + && is_expr_field2d_v>) + || (is_expr_field2d_v> + && is_expr_constant_v>)> {}; /*! 
* \brief 2D X-Y scalar fields @@ -103,8 +102,11 @@ public: DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Average}); - template && is_expr_field2d_v>> + template < + typename L, typename R, typename Func, + typename = std::enable_if_t<(is_expr_field2d_v && is_expr_field2d_v) + || (is_expr_constant_v && is_expr_field2d_v) + || (is_expr_field2d_v && is_expr_constant_v)>> Field2D(const BinaryExpr& expr) { Array data{expr.size()}; expr.evaluate(&data[0]); @@ -188,9 +190,13 @@ public: template std::enable_if_t, Field2D&> - operator=(BinaryExpr& expr) { + operator=(const BinaryExpr& expr) { std::cout << "RUNNING Field2D operator= with CUDA\n"; - expr.evaluate(&data[0]); + if (isAllocated()) { + expr.evaluate(&data[0]); + } else { + *this = Field2D{expr}; + } return *this; } @@ -278,7 +284,19 @@ public: /// In-place division. Copy-on-write used if data is shared Field2D& operator/=(const Field2D& rhs); /// In-place division. Copy-on-write used if data is shared - Field2D& operator/=(BoutReal rhs); + //Field2D& operator/=(BoutReal rhs); + template >> + Field2D& operator/=(R rhs) { + //printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) / rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) / rhs; + } + + return *this; + } // FieldData virtual functions @@ -425,13 +443,47 @@ operator*(const L& lhs, const R& rhs) { Field3D operator/(const Field2D& lhs, const Field3D& rhs); Field2D operator+(const Field2D& lhs, BoutReal rhs); -Field2D operator-(const Field2D& lhs, BoutReal rhs); +//Field2D operator-(const Field2D& lhs, BoutReal rhs); +#if 1 +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::Sub>> +operator-(const L& lhs, R rhs) { + return BinaryExpr, bout::op::Sub>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::Sub{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +#endif Field2D operator*(const 
Field2D& lhs, BoutReal rhs); Field2D operator/(const Field2D& lhs, BoutReal rhs); Field2D operator+(BoutReal lhs, const Field2D& rhs); Field2D operator-(BoutReal lhs, const Field2D& rhs); -Field2D operator*(BoutReal lhs, const Field2D& rhs); +//Field2D operator*(BoutReal lhs, const Field2D& rhs); +#if 1 +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr, R, bout::op::Mul>> +operator*(L lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + + return BinaryExpr, R, bout::op::Mul>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Mul{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + std::nullopt, + rhs.getMesh()->getRegion2D("RGN_ALL")}; +} +#endif Field2D operator/(BoutReal lhs, const Field2D& rhs); /*! diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 80956cd6f9..c11bab0599 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -467,8 +467,12 @@ public: operator=(BinaryExpr& expr) { std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); - //expr.evaluate(&data[0]); - expr.evaluateWithResult(static_cast(*this)); + if(isAllocated()) { + expr.evaluate(&data[0]); + } + else { + *this = Field3D{expr}; + } return *this; } @@ -481,15 +485,15 @@ public: Field3D& operator+=(const R& rhs) { //printf("RUNNING operator+= with CUDA\n"); if (data.unique()) { - printf("RUNNING operator+= with CUDA with evaluateWithResult\n"); + //std::cout << "RUNNING Field3D operator+= w/ CUDA" << __FILE__ << " " + // << std::to_string(__LINE__) << "\n"; // Delete existing parallel slices. We don't copy parallel slices, so any // that currently exist will be incorrect. 
clearParallelSlices(); auto BE = (*this) + rhs; regionID = BE.getRegionID(); - //BE.evaluate(&data[0]); - BE.evaluateWithResult(static_cast(*this)); + BE.evaluate(&data[0]); } else { (*this) = (*this) + rhs; } @@ -802,7 +806,7 @@ Field3D operator+(BoutReal lhs, const Field3D& rhs); Field3D operator-(BoutReal lhs, const Field3D& rhs); //Field3D operator*(BoutReal lhs, const Field3D& rhs); template -std::enable_if_t && is_expr_field3d_v, +std::enable_if_t && is_expr_field3d_v, BinaryExpr, R, bout::op::Mul>> operator*(const L& lhs, const R& rhs) { //static_assert(always_false || always_false, "Hello"); diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index b78dadc315..6ec7947be4 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -29,18 +29,20 @@ template inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; template -struct is_expr_boutreal : std::false_type {}; +struct is_expr_constant : std::bool_constant> {}; template -inline constexpr bool is_expr_boutreal_v = is_expr_boutreal>::value; - -template <> -struct is_expr_boutreal : std::true_type {}; +inline constexpr bool is_expr_constant_v = is_expr_constant>::value; template -struct is_expr_boutreal> - : std::integral_constant>> {}; +struct is_expr_constant> + : std::integral_constant>> {}; +// After the specialization… +static_assert(is_expr_constant_v> == true, + "Constant should be recognized as an expr_constant!"); +static_assert(is_expr_constant_v> == true, + "Constant should be recognized as an expr_constant!"); namespace bout { namespace op { @@ -101,7 +103,7 @@ __global__ __launch_bounds__(256) static void evaluatorExpr(BoutReal* out, const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= expr.size()) { - return; // Out of bounds + return; } int idx = expr.regionIdx(tid); out[idx] = expr(idx); // single‐pass fusion @@ -112,24 +114,6 @@ __global__ __launch_bounds__(256) static void evaluatorExpr(BoutReal* out, //} } -template 
-__global__ __launch_bounds__(256) static void evaluatorExprWithResult(Result res, - const Expr expr) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= expr.size()) { - return; // Out of bounds - } - int idx = expr.regionIdx(tid); - res[idx] = expr(idx); // single‐pass fusion - //res(idx, expr(idx)); // single‐pass fusion - //res(idx) = expr(idx); // single‐pass fusion - //int stride = blockDim.x * gridDim.x; - //for (int i = tid, e = expr.size(); i < e; i += stride) { - // int idx = expr.regionIdx(i); - // out[idx] = expr(idx); // single‐pass fusion - //} -} - inline std::unordered_map> regionIndicesCache; template @@ -222,21 +206,10 @@ struct BinaryExpr { //} } - template - void evaluateWithResult(const Result& res) const { - constexpr int THREADS = 256; - int blocks = (size() + THREADS - 1) / THREADS; - evaluatorExprWithResult<<>>(res, static_cast(*this)); - cudaDeviceSynchronize(); - //for(int i=0; i getRegionID() const { return regionID; }; }; -#endif // BOUT_EXPRESSION_HXX \ No newline at end of file +#endif // BOUT_FIELDOPS_HXX \ No newline at end of file diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 439320b0ae..9ae40ba41b 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -879,6 +879,7 @@ Field2D operator/(const Field2D& lhs, const BoutReal rhs) { return result; } +#if 0 // Provide the C++ operator to update Field2D by division with BoutReal Field2D& Field2D::operator/=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -899,6 +900,7 @@ Field2D& Field2D::operator/=(const BoutReal rhs) { } return *this; } +#endif // Provide the C++ wrapper for addition of Field2D and BoutReal Field2D operator+(const Field2D& lhs, const BoutReal rhs) { @@ -937,6 +939,7 @@ Field2D& Field2D::operator+=(const BoutReal rhs) { } // Provide the C++ wrapper for subtraction of Field2D and BoutReal +#if 0 Field2D operator-(const 
Field2D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -951,6 +954,7 @@ Field2D operator-(const Field2D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field2D by subtraction with BoutReal Field2D& Field2D::operator-=(const BoutReal rhs) { @@ -1727,6 +1731,7 @@ Field3D operator-(const BoutReal lhs, const Field3D& rhs) { return result; } +#if 0 // Provide the C++ wrapper for multiplication of BoutReal and Field2D Field2D operator*(const BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1742,6 +1747,7 @@ Field2D operator*(const BoutReal lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for division of BoutReal and Field2D Field2D operator/(const BoutReal lhs, const Field2D& rhs) { diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 9861fe58bf..8bded7fee5 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -1266,9 +1266,9 @@ int Coordinates::calcCovariant(const std::string& region) { } BoutReal maxerr; - maxerr = BOUTMAX(max(abs((g_11 * g11 + g_12 * g12 + g_13 * g13) - 1)), - max(abs((g_12 * g12 + g_22 * g22 + g_23 * g23) - 1)), - max(abs((g_13 * g13 + g_23 * g23 + g_33 * g33) - 1))); + maxerr = BOUTMAX(max(abs(FieldMetric{(g_11 * g11 + g_12 * g12 + g_13 * g13) - 1})), + max(abs(FieldMetric{(g_12 * g12 + g_22 * g22 + g_23 * g23) - 1})), + max(abs(FieldMetric{(g_13 * g13 + g_23 * g23 + g_33 * g33) - 1}))); output_info.write("\tLocal maximum error in diagonal inversion is {:e}\n", maxerr); @@ -1322,9 +1322,9 @@ int Coordinates::calcContravariant(const std::string& region) { } BoutReal maxerr; - maxerr = BOUTMAX(max(abs((g_11 * g11 + g_12 * g12 + g_13 * g13) - 1)), - max(abs((g_12 * g12 + g_22 * g22 + g_23 * g23) - 1)), - max(abs((g_13 * g13 + g_23 * g23 + g_33 * g33) - 1))); 
+ maxerr = BOUTMAX(max(abs(FieldMetric{(g_11 * g11 + g_12 * g12 + g_13 * g13) - 1})), + max(abs(FieldMetric{(g_12 * g12 + g_22 * g22 + g_23 * g23) - 1})), + max(abs(FieldMetric{(g_13 * g13 + g_23 * g23 + g_33 * g33) - 1}))); output_info.write("\tMaximum error in diagonal inversion is {:e}\n", maxerr); From 026645a47a43e7f9f0cb748dac893313c2a3ba2f Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sat, 31 May 2025 18:30:17 -0700 Subject: [PATCH 13/29] More operators macro for dedup definition per operator --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 3 +- include/bout/field2d.hxx | 127 ++++++++++++++++-- include/bout/field3d.hxx | 80 ++++++++++- include/bout/utils.hxx | 5 +- src/field/generated_fieldops.cxx | 24 ++++ src/mesh/coordinates.cxx | 4 +- src/physics/snb.cxx | 2 +- 7 files changed, 221 insertions(+), 24 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index ec2ab8e2a7..7d38780814 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1031,7 +1031,8 @@ class ELMpb : public PhysicsModel { vacuum_trans *= pnorm; // Transitions from 0 in core to 1 in vacuum - vac_mask = (1.0 - tanh((P0 - vacuum_pressure) / vacuum_trans)) / 2.0; + Field2D tanh_res = tanh(Field2D{(P0 - vacuum_pressure) / vacuum_trans}); + vac_mask = (1.0 - tanh_res) / 2.0; if (spitzer_resist) { // Use Spitzer resistivity diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index baff508331..97946ee713 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -270,17 +270,66 @@ public: } /// In-place addition. 
Copy-on-write used if data is shared - Field2D& operator+=(const Field2D& rhs); + //Field2D& operator+=(const Field2D& rhs); + template >> + Field2D& operator+=(const R& rhs) { + //printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) + rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) + rhs; // data not unique: assign the sum via operator= + } + + return *this; + } /// In-place addition. Copy-on-write used if data is shared Field2D& operator+=(BoutReal rhs); /// In-place subtraction. Copy-on-write used if data is shared - Field2D& operator-=(const Field2D& rhs); + //Field2D& operator-=(const Field2D& rhs); + //here1 + template >> + Field2D& operator-=(const R& rhs) { + //printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) - rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) - rhs; // data not unique: assign the difference via operator= + } + + return *this; + } /// In-place subtraction. Copy-on-write used if data is shared Field2D& operator-=(BoutReal rhs); /// In-place multiplication. Copy-on-write used if data is shared - Field2D& operator*=(const Field2D& rhs); + //Field2D& operator*=(const Field2D& rhs); + template >> + Field2D& operator*=(const R& rhs) { + //printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) * rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) * rhs; + } + + return *this; + } /// In-place multiplication. Copy-on-write used if data is shared - Field2D& operator*=(BoutReal rhs); + //Field2D& operator*=(BoutReal rhs); + template >> + Field2D& operator*=(R rhs) { + //printf("RUNNING operator+= with CUDA\n"); + if (data.unique()) { + auto BE = (*this) * rhs; + BE.evaluate(&data[0]); + } else { + (*this) = (*this) * rhs; + } + + return *this; + } /// In-place division. Copy-on-write used if data is shared Field2D& operator/=(const Field2D& rhs); /// In-place division. 
Copy-on-write used if data is shared @@ -444,7 +493,6 @@ Field3D operator/(const Field2D& lhs, const Field3D& rhs); Field2D operator+(const Field2D& lhs, BoutReal rhs); //Field2D operator-(const Field2D& lhs, BoutReal rhs); -#if 1 template std::enable_if_t && is_expr_constant_v, BinaryExpr, bout::op::Sub>> @@ -459,14 +507,56 @@ operator-(const L& lhs, R rhs) { std::nullopt, lhs.getMesh()->getRegion2D("RGN_ALL")}; } -#endif -Field2D operator*(const Field2D& lhs, BoutReal rhs); -Field2D operator/(const Field2D& lhs, BoutReal rhs); +//Field2D operator*(const Field2D& lhs, BoutReal rhs); +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::Mul>> +operator*(const L& lhs, R rhs) { + return BinaryExpr, bout::op::Mul>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} +//Field2D operator/(const Field2D& lhs, BoutReal rhs); +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::Div>> +operator/(const L& lhs, R rhs) { + return BinaryExpr, bout::op::Div>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::Div{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + std::nullopt, + lhs.getMesh()->getRegion2D("RGN_ALL")}; +} Field2D operator+(BoutReal lhs, const Field2D& rhs); -Field2D operator-(BoutReal lhs, const Field2D& rhs); +//Field2D operator-(BoutReal lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr, R, bout::op::Sub>> +operator-(L lhs, const R& rhs) { + //static_assert(always_false || always_false, "Hello"); + + return BinaryExpr, R, bout::op::Sub>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Sub{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + std::nullopt, + rhs.getMesh()->getRegion2D("RGN_ALL")}; +} //Field2D operator*(BoutReal lhs, const Field2D& rhs); -#if 1 template std::enable_if_t && 
is_expr_field2d_v, BinaryExpr, R, bout::op::Mul>> @@ -483,8 +573,21 @@ operator*(L lhs, const R& rhs) { std::nullopt, rhs.getMesh()->getRegion2D("RGN_ALL")}; } -#endif -Field2D operator/(BoutReal lhs, const Field2D& rhs); +//Field2D operator/(BoutReal lhs, const Field2D& rhs); +template +std::enable_if_t && is_expr_field2d_v, + BinaryExpr, R, bout::op::Div>> +operator/(L lhs, const R& rhs) { + return BinaryExpr, R, bout::op::Div>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Div{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + std::nullopt, + rhs.getMesh()->getRegion2D("RGN_ALL")}; +} /*! * Unary minus. Returns the negative of given field, diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index c11bab0599..cab156db79 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -548,7 +548,25 @@ public: return *this; } Field3D& operator*=(const Field2D& rhs); - Field3D& operator*=(BoutReal rhs); + //Field3D& operator*=(BoutReal rhs); + // here1 + template >> + Field3D& operator*=(R rhs) { + //printf("RUNNING operator*= with CUDA\n"); + if (data.unique()) { + // Delete existing parallel slices. We don't copy parallel slices, so any + // that currently exist will be incorrect. 
+ clearParallelSlices(); + + auto BE = (*this) * rhs; + regionID = BE.getRegionID(); + BE.evaluate(&data[0]); + } else { + (*this) = (*this) * rhs; + } + + return *this; + } ///@} /// Division operators @@ -799,17 +817,51 @@ operator/(const L& lhs, const R& rhs) { Field3D operator+(const Field3D& lhs, BoutReal rhs); Field3D operator-(const Field3D& lhs, BoutReal rhs); -Field3D operator*(const Field3D& lhs, BoutReal rhs); +//Field3D operator*(const Field3D& lhs, BoutReal rhs); +//here2 +template +std::enable_if_t && is_expr_constant_v, + BinaryExpr, bout::op::Mul>> +operator*(const L& lhs, R rhs) { + //static_assert(always_false || always_false, "Hello"); + auto regionID = lhs.getRegionID(); + + return BinaryExpr, bout::op::Mul>{ + static_cast(lhs), + static_cast::View>(rhs), + bout::op::Mul{}, + lhs.getMesh(), + lhs.getLocation(), + lhs.getDirections(), + regionID, + lhs.getMesh()->getRegion("RGN_ALL")}; +} Field3D operator/(const Field3D& lhs, BoutReal rhs); -Field3D operator+(BoutReal lhs, const Field3D& rhs); +//Field3D operator+(BoutReal lhs, const Field3D& rhs); +template +std::enable_if_t && is_expr_field3d_v, + BinaryExpr, R, bout::op::Add>> +operator+(const L& lhs, const R& rhs) { + auto regionID = rhs.getRegionID(); + + return BinaryExpr, R, bout::op::Add>{ + static_cast::View>(lhs), + static_cast(rhs), + bout::op::Add{}, + rhs.getMesh(), + rhs.getLocation(), + rhs.getDirections(), + regionID, + rhs.getMesh()->getRegion("RGN_ALL")}; +} Field3D operator-(BoutReal lhs, const Field3D& rhs); +#if 0 //Field3D operator*(BoutReal lhs, const Field3D& rhs); template std::enable_if_t && is_expr_field3d_v, BinaryExpr, R, bout::op::Mul>> operator*(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); auto regionID = rhs.getRegionID(); return BinaryExpr, R, bout::op::Mul>{ @@ -822,9 +874,29 @@ operator*(const L& lhs, const R& rhs) { regionID, rhs.getMesh()->getRegion("RGN_ALL")}; } +#endif Field3D operator/(BoutReal lhs, const Field3D& 
rhs); +#define FIELD3D_BOUTREAL_OP(OP_SYM, OP_KIND) \ + template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr, R, bout::op::OP_KIND>> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + return BinaryExpr, R, bout::op::OP_KIND>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_KIND{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ + } + +FIELD3D_BOUTREAL_OP(*, Mul) + /*! * Unary minus. Returns the negative of given field, * iterates over whole domain including guard/boundary cells. diff --git a/include/bout/utils.hxx b/include/bout/utils.hxx index 42aa761886..c8383b12fa 100644 --- a/include/bout/utils.hxx +++ b/include/bout/utils.hxx @@ -426,10 +426,7 @@ inline auto SQ(const T& t) { return t * t; } -template <> -BOUT_HOST_DEVICE inline auto SQ(const BoutReal& t) { - return t * t; -} +BOUT_HOST_DEVICE inline BoutReal SQ(const BoutReal& t) { return t * t; } /*! 
* Round \p x to the nearest integer diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index 9ae40ba41b..4040f8a175 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -315,6 +315,8 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { } // Provide the C++ wrapper for multiplication of Field3D and BoutReal +// here2 +#if 0 Field3D operator*(const Field3D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -331,8 +333,11 @@ Field3D operator*(const Field3D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by multiplication with BoutReal +// here1 +#if 0 Field3D& Field3D::operator*=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; // only if data is unique we update the field @@ -355,6 +360,7 @@ Field3D& Field3D::operator*=(const BoutReal rhs) { } return *this; } +#endif // Provide the C++ wrapper for division of Field3D and BoutReal Field3D operator/(const Field3D& lhs, const BoutReal rhs) { @@ -601,6 +607,7 @@ Field2D operator*(const Field2D& lhs, const Field2D& rhs) { } #endif +#if 0 // Provide the C++ operator to update Field2D by multiplication with Field2D Field2D& Field2D::operator*=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -621,6 +628,7 @@ Field2D& Field2D::operator*=(const Field2D& rhs) { } return *this; } +#endif #if 0 // Provide the C++ wrapper for division of Field2D and Field2D @@ -681,6 +689,7 @@ Field2D operator+(const Field2D& lhs, const Field2D& rhs) { } #endif +#if 0 // Provide the C++ operator to update Field2D by addition with Field2D Field2D& Field2D::operator+=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -701,6 +710,7 @@ Field2D& 
Field2D::operator+=(const Field2D& rhs) { } return *this; } +#endif // Provide the C++ wrapper for subtraction of Field2D and Field2D #if 0 @@ -721,6 +731,7 @@ Field2D operator-(const Field2D& lhs, const Field2D& rhs) { } #endif +#if 0 // Provide the C++ operator to update Field2D by subtraction with Field2D Field2D& Field2D::operator-=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -741,6 +752,7 @@ Field2D& Field2D::operator-=(const Field2D& rhs) { } return *this; } +#endif // Provide the C++ wrapper for multiplication of Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { @@ -826,6 +838,7 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } +#if 0 // Provide the C++ wrapper for multiplication of Field2D and BoutReal Field2D operator*(const Field2D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -841,7 +854,9 @@ Field2D operator*(const Field2D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif +#if 0 // Provide the C++ operator to update Field2D by multiplication with BoutReal Field2D& Field2D::operator*=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -861,7 +876,9 @@ Field2D& Field2D::operator*=(const BoutReal rhs) { } return *this; } +#endif +#if 0 // Provide the C++ wrapper for division of Field2D and BoutReal Field2D operator/(const Field2D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -878,6 +895,7 @@ Field2D operator/(const Field2D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif #if 0 // Provide the C++ operator to update Field2D by division with BoutReal @@ -1695,6 +1713,7 @@ Field3D operator/(const BoutReal lhs, const Field3D& rhs) { return result; } +#if 0 // Provide the 
C++ wrapper for addition of BoutReal and Field3D Field3D operator+(const BoutReal lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1712,6 +1731,7 @@ Field3D operator+(const BoutReal lhs, const Field3D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for subtraction of BoutReal and Field3D Field3D operator-(const BoutReal lhs, const Field3D& rhs) { @@ -1749,6 +1769,7 @@ Field2D operator*(const BoutReal lhs, const Field2D& rhs) { } #endif +#if 0 // Provide the C++ wrapper for division of BoutReal and Field2D Field2D operator/(const BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1764,6 +1785,7 @@ Field2D operator/(const BoutReal lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for addition of BoutReal and Field2D Field2D operator+(const BoutReal lhs, const Field2D& rhs) { @@ -1781,6 +1803,7 @@ Field2D operator+(const BoutReal lhs, const Field2D& rhs) { return result; } +#if 0 // Provide the C++ wrapper for subtraction of BoutReal and Field2D Field2D operator-(const BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1796,6 +1819,7 @@ Field2D operator-(const BoutReal lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { diff --git a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index 8bded7fee5..b8fd33c019 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -1102,7 +1102,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2x, "d2x" + suffix, 0.0, false, location)) { output_warn.write( "\tWARNING: differencing quantity 'd2x' not found. 
Calculating from dx\n"); - d1_dx = bout::derivatives::index::DDX(1. / dx); // d/di(1/dx) + d1_dx = bout::derivatives::index::DDX(FieldMetric{1. / dx}); // d/di(1/dx) communicate(d1_dx); d1_dx = @@ -1156,7 +1156,7 @@ int Coordinates::geometry(bool recalculate_staggered, if (localmesh->get(d2x, "d2x", 0.0, false)) { output_warn.write( "\tWARNING: differencing quantity 'd2x' not found. Calculating from dx\n"); - d1_dx = bout::derivatives::index::DDX(1. / dx); // d/di(1/dx) + d1_dx = bout::derivatives::index::DDX(FieldMetric{1. / dx}); // d/di(1/dx) communicate(d1_dx); d1_dx = diff --git a/src/physics/snb.cxx b/src/physics/snb.cxx index 80da9e1bf8..475b12ca1a 100644 --- a/src/physics/snb.cxx +++ b/src/physics/snb.cxx @@ -14,7 +14,7 @@ Field3D HeatFluxSNB::divHeatFlux(const Field3D& Te, const Field3D& Ne, Field3D thermal_speed = sqrt(2. * SI::qe * Te / SI::Me); BoutReal Y = SQ(SQ(SI::qe) / (SI::e0 * SI::Me)) / (4 * PI); - Field3D coulomb_log = 6.6 - 0.5 * log(Ne * 1e-20) + 1.5 * log(Te); + Field3D coulomb_log = 6.6 - 0.5 * log(Field3D{Ne * 1e-20}) + 1.5 * log(Te); // Thermal electron-electron mean free path [m] Field3D lambda_ee_T = pow(thermal_speed, 4) / (Y * Ne * coulomb_log); From 338920e284e39ae7f9954a0adcd3e87cb6444992 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sat, 31 May 2025 21:17:34 -0700 Subject: [PATCH 14/29] More operators --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 2 +- include/bout/field3d.hxx | 129 +++-- include/bout/fieldops.hxx | 14 +- src/field/generated_fieldops.cxx | 541 +----------------- src/physics/snb.cxx | 2 +- 5 files changed, 76 insertions(+), 612 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 7d38780814..901e57bf97 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -1283,7 +1283,7 @@ class ELMpb : public PhysicsModel { //////////////////////////////////////////// // Transitions 
from 0 in core to 1 in vacuum if (nonlinear) { - vac_mask = (1.0 - tanh(((P0 + P) - vacuum_pressure) / vacuum_trans)) / 2.0; + vac_mask = (1.0 - tanh(Field3D{((P0 + P) - vacuum_pressure) / vacuum_trans})) / 2.0; // Update resistivity if (spitzer_resist) { diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index cab156db79..d4dd1397a7 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -749,7 +749,7 @@ BinaryExpr operator/(const L& lhs, const R& rhs) { : lhs.getMesh()->getRegion("RGN_ALL"))}; } -Field3D operator+(const Field3D& lhs, const Field2D& rhs); +//Field3D operator+(const Field3D& lhs, const Field2D& rhs); #if 0 template && is_expr_field2d_v, @@ -771,8 +771,9 @@ BinaryExpr operator+(const L& lhs, const R& rhs) { rhs.getRegion("RGN_ALL")}; } #endif -Field3D operator-(const Field3D& lhs, const Field2D& rhs); +//Field3D operator-(const Field3D& lhs, const Field2D& rhs); //Field3D operator*(const Field3D& lhs, const Field2D& rhs); +#if 0 template std::enable_if_t && is_expr_field2d_v, BinaryExpr> @@ -793,7 +794,9 @@ operator*(const L& lhs, const R& rhs) { regionID, lhs.getMesh()->getRegion("RGN_ALL")}; } +#endif //Field3D operator/(const Field3D& lhs, const Field2D& rhs); +#if 0 template std::enable_if_t && is_expr_field2d_v, BinaryExpr> @@ -814,80 +817,73 @@ operator/(const L& lhs, const R& rhs) { regionID, lhs.getMesh()->getRegion("RGN_ALL")}; } +#endif -Field3D operator+(const Field3D& lhs, BoutReal rhs); -Field3D operator-(const Field3D& lhs, BoutReal rhs); +#define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> operator OP_SYM(const L & lhs, \ + const R & rhs) { \ + auto regionID = lhs.getRegionID(); \ + int mesh_nz = lhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs).setScale(1, mesh_nz), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + 
lhs.getMesh()->getRegion("RGN_ALL")}; \ + } + +FIELD3D_FIELD3D_FIELD2D_OP(+, Add) +FIELD3D_FIELD3D_FIELD2D_OP(-, Sub) +FIELD3D_FIELD3D_FIELD2D_OP(*, Mul) +FIELD3D_FIELD3D_FIELD2D_OP(/, Div) + +//Field3D operator+(const Field3D& lhs, BoutReal rhs); +//Field3D operator-(const Field3D& lhs, BoutReal rhs); //Field3D operator*(const Field3D& lhs, BoutReal rhs); -//here2 -template -std::enable_if_t && is_expr_constant_v, - BinaryExpr, bout::op::Mul>> -operator*(const L& lhs, R rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = lhs.getRegionID(); +//Field3D operator/(const Field3D& lhs, BoutReal rhs); - return BinaryExpr, bout::op::Mul>{ - static_cast(lhs), - static_cast::View>(rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - lhs.getMesh()->getRegion("RGN_ALL")}; -} -Field3D operator/(const Field3D& lhs, BoutReal rhs); +#define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + auto regionID = lhs.getRegionID(); \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + lhs.getMesh()->getRegion("RGN_ALL")}; \ + } + +FIELD3D_FIELD3D_BOUTREAL_OP(+, Add) +FIELD3D_FIELD3D_BOUTREAL_OP(-, Sub) +FIELD3D_FIELD3D_BOUTREAL_OP(*, Mul) +FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) //Field3D operator+(BoutReal lhs, const Field3D& rhs); -template -std::enable_if_t && is_expr_field3d_v, - BinaryExpr, R, bout::op::Add>> -operator+(const L& lhs, const R& rhs) { - auto regionID = rhs.getRegionID(); - - return BinaryExpr, R, bout::op::Add>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Add{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - regionID, - rhs.getMesh()->getRegion("RGN_ALL")}; -} -Field3D 
operator-(BoutReal lhs, const Field3D& rhs); -#if 0 +//Field3D operator-(BoutReal lhs, const Field3D& rhs); //Field3D operator*(BoutReal lhs, const Field3D& rhs); -template -std::enable_if_t && is_expr_field3d_v, - BinaryExpr, R, bout::op::Mul>> -operator*(const L& lhs, const R& rhs) { - auto regionID = rhs.getRegionID(); - - return BinaryExpr, R, bout::op::Mul>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Mul{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - regionID, - rhs.getMesh()->getRegion("RGN_ALL")}; -} -#endif - -Field3D operator/(BoutReal lhs, const Field3D& rhs); +//Field3D operator/(BoutReal lhs, const Field3D& rhs); -#define FIELD3D_BOUTREAL_OP(OP_SYM, OP_KIND) \ +#define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ template \ std::enable_if_t && is_expr_field3d_v, \ - BinaryExpr, R, bout::op::OP_KIND>> \ - operator OP_SYM(const L & lhs, const R & rhs) { \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ auto regionID = rhs.getRegionID(); \ - return BinaryExpr, R, bout::op::OP_KIND>{ \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ static_cast::View>(lhs), \ static_cast(rhs), \ - bout::op::OP_KIND{}, \ + bout::op::OP_TYPE{}, \ rhs.getMesh(), \ rhs.getLocation(), \ rhs.getDirections(), \ @@ -895,7 +891,10 @@ Field3D operator/(BoutReal lhs, const Field3D& rhs); rhs.getMesh()->getRegion("RGN_ALL")}; \ } -FIELD3D_BOUTREAL_OP(*, Mul) +FIELD3D_BOUTREAL_FIELD3D_OP(+, Add) +FIELD3D_BOUTREAL_FIELD3D_OP(-, Sub) +FIELD3D_BOUTREAL_FIELD3D_OP(*, Mul) +FIELD3D_BOUTREAL_FIELD3D_OP(/, Div) /*! * Unary minus. 
Returns the negative of given field, diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 6ec7947be4..0de73d1205 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -38,12 +38,7 @@ template struct is_expr_constant> : std::integral_constant>> {}; -// After the specialization… -static_assert(is_expr_constant_v> == true, - "Constant should be recognized as an expr_constant!"); -static_assert(is_expr_constant_v> == true, - "Constant should be recognized as an expr_constant!"); - +constexpr int THREADS = 256; namespace bout { namespace op { struct Assign { @@ -99,8 +94,7 @@ struct Add { }; template -__global__ __launch_bounds__(256) static void evaluatorExpr(BoutReal* out, - const Expr expr) { +__global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= expr.size()) { return; @@ -197,13 +191,9 @@ struct BinaryExpr { operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } void evaluate(BoutReal* data) const { - constexpr int THREADS = 256; int blocks = (size() + THREADS - 1) / THREADS; evaluatorExpr<<>>(&data[0], static_cast(*this)); cudaDeviceSynchronize(); - //for(int i=0; i #include -// Provide the C++ wrapper for multiplication of Field3D and Field2D -#if 0 -Field3D operator*(const Field3D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] * rhs[index]; - } - } - - checkData(result); - return result; -} -#endif - // Provide the C++ operator to update Field3D by 
multiplication with Field2D Field3D& Field3D::operator*=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -63,65 +37,6 @@ Field3D& Field3D::operator*=(const Field2D& rhs) { } #if 0 -// Provide the C++ wrapper for division of Field3D and Field2D -Field3D operator/(const Field3D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - const auto tmp = 1.0 / rhs[index]; - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] * tmp; - } - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field3D by division with Field2D -Field3D& Field3D::operator/=(const Field2D& rhs) { - std::cout << "RUNNING operator "<< __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - const auto tmp = 1.0 / rhs[index]; - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] *= tmp; - } - } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for addition of Field3D and Field2D Field3D operator+(const Field3D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -145,6 +60,7 @@ Field3D operator+(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by addition with Field2D Field3D& Field3D::operator+=(const Field2D& rhs) { @@ -176,6 +92,7 @@ Field3D& Field3D::operator+=(const Field2D& rhs) { return *this; } +#if 0 // Provide the C++ wrapper for subtraction of Field3D and Field2D Field3D operator-(const Field3D& lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -199,6 +116,7 @@ Field3D operator-(const Field3D& lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by subtraction with Field2D Field3D& Field3D::operator-=(const Field2D& rhs) { @@ -314,54 +232,7 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { return result; } -// Provide the C++ wrapper for multiplication of Field3D and BoutReal -// here2 -#if 0 -Field3D operator*(const Field3D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs; - } - - checkData(result); - return 
result; -} -#endif - -// Provide the C++ operator to update Field3D by multiplication with BoutReal -// here1 #if 0 -Field3D& Field3D::operator*=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for division of Field3D and BoutReal Field3D operator/(const Field3D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -380,6 +251,7 @@ Field3D operator/(const Field3D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by division with BoutReal Field3D& Field3D::operator/=(const BoutReal rhs) { @@ -406,6 +278,7 @@ Field3D& Field3D::operator/=(const BoutReal rhs) { return *this; } +#if 0 // Provide the C++ wrapper for addition of Field3D and BoutReal Field3D operator+(const Field3D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -423,6 +296,7 @@ Field3D operator+(const Field3D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by addition with BoutReal Field3D& Field3D::operator+=(const BoutReal rhs) { @@ -448,6 +322,7 @@ Field3D& Field3D::operator+=(const BoutReal rhs) { return *this; } +#if 0 // Provide the C++ wrapper for subtraction of Field3D and BoutReal Field3D operator-(const Field3D& lhs, const BoutReal rhs) { std::cout << "RUNNING 
operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -465,6 +340,7 @@ Field3D operator-(const Field3D& lhs, const BoutReal rhs) { checkData(result); return result; } +#endif // Provide the C++ operator to update Field3D by subtraction with BoutReal Field3D& Field3D::operator-=(const BoutReal rhs) { @@ -490,32 +366,6 @@ Field3D& Field3D::operator-=(const BoutReal rhs) { return *this; } -// Provide the C++ wrapper for multiplication of Field2D and Field3D -#if 0 -Field3D operator*(const Field2D& lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] * rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} -#endif - // Provide the C++ wrapper for division of Field2D and Field3D Field3D operator/(const Field2D& lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -588,67 +438,6 @@ Field3D operator-(const Field2D& lhs, const Field3D& rhs) { return result; } -// Provide the C++ wrapper for multiplication of Field2D and Field2D -#if 0 -Field2D operator*(const Field2D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs[index]; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by multiplication with Field2D 
-Field2D& Field2D::operator*=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} -#endif - -#if 0 -// Provide the C++ wrapper for division of Field2D and Field2D -Field2D operator/(const Field2D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] / rhs[index]; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ operator to update Field2D by division with Field2D Field2D& Field2D::operator/=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -670,90 +459,6 @@ Field2D& Field2D::operator/=(const Field2D& rhs) { return *this; } -// Provide the C++ wrapper for addition of Field2D and Field2D -#if 0 -Field2D operator+(const Field2D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs[index]; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by addition with Field2D -Field2D& Field2D::operator+=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ 
<< " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} -#endif - -// Provide the C++ wrapper for subtraction of Field2D and Field2D -#if 0 -Field2D operator-(const Field2D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs[index]; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by subtraction with Field2D -Field2D& Field2D::operator-=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for multiplication of Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -838,88 +543,6 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } -#if 0 -// Provide the C++ wrapper for multiplication of Field2D and BoutReal -Field2D operator*(const Field2D& lhs, const BoutReal rhs) { - 
std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * rhs; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by multiplication with BoutReal -Field2D& Field2D::operator*=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} -#endif - -#if 0 -// Provide the C++ wrapper for division of Field2D and BoutReal -Field2D operator/(const Field2D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * tmp; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by division with BoutReal -Field2D& Field2D::operator/=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= tmp; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for addition of Field2D 
and BoutReal Field2D operator+(const Field2D& lhs, const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -956,24 +579,6 @@ Field2D& Field2D::operator+=(const BoutReal rhs) { return *this; } -// Provide the C++ wrapper for subtraction of Field2D and BoutReal -#if 0 -Field2D operator-(const Field2D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ operator to update Field2D by subtraction with BoutReal Field2D& Field2D::operator-=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1675,118 +1280,6 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } -#if 0 -// Provide the C++ wrapper for multiplication of BoutReal and Field3D -Field3D operator*(const BoutReal lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs * rhs[index]; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ wrapper for division of BoutReal and Field3D -Field3D operator/(const BoutReal lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs / rhs[index]; - } - - checkData(result); - return result; 
-} - -#if 0 -// Provide the C++ wrapper for addition of BoutReal and Field3D -Field3D operator+(const BoutReal lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs + rhs[index]; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ wrapper for subtraction of BoutReal and Field3D -Field3D operator-(const BoutReal lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs - rhs[index]; - } - - checkData(result); - return result; -} - -#if 0 -// Provide the C++ wrapper for multiplication of BoutReal and Field2D -Field2D operator*(const BoutReal lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs * rhs[index]; - } - - checkData(result); - return result; -} -#endif - -#if 0 -// Provide the C++ wrapper for division of BoutReal and Field2D -Field2D operator/(const BoutReal lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs / rhs[index]; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ wrapper for addition of BoutReal and Field2D Field2D operator+(const 
BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1803,24 +1296,6 @@ Field2D operator+(const BoutReal lhs, const Field2D& rhs) { return result; } -#if 0 -// Provide the C++ wrapper for subtraction of BoutReal and Field2D -Field2D operator-(const BoutReal lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs - rhs[index]; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; diff --git a/src/physics/snb.cxx b/src/physics/snb.cxx index 475b12ca1a..f21bfb7ee0 100644 --- a/src/physics/snb.cxx +++ b/src/physics/snb.cxx @@ -11,7 +11,7 @@ namespace bout { Field3D HeatFluxSNB::divHeatFlux(const Field3D& Te, const Field3D& Ne, Field3D* Div_Q_SH_out) { - Field3D thermal_speed = sqrt(2. * SI::qe * Te / SI::Me); + Field3D thermal_speed = sqrt(Field3D{2. 
* SI::qe * Te / SI::Me}); BoutReal Y = SQ(SQ(SI::qe) / (SI::e0 * SI::Me)) / (4 * PI); Field3D coulomb_log = 6.6 - 0.5 * log(Field3D{Ne * 1e-20}) + 1.5 * log(Te); From 6e8818114e398258ca548d3b85e061d92c78264e Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sat, 31 May 2025 23:25:47 -0700 Subject: [PATCH 15/29] More operators and cleanup --- include/bout/field2d.hxx | 25 ++++- include/bout/field3d.hxx | 157 +++++-------------------------- src/field/generated_fieldops.cxx | 2 + 3 files changed, 47 insertions(+), 137 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 97946ee713..b7911b7e76 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -538,7 +538,8 @@ operator/(const L& lhs, R rhs) { lhs.getMesh()->getRegion2D("RGN_ALL")}; } -Field2D operator+(BoutReal lhs, const Field2D& rhs); +#if 0 +//Field2D operator+(BoutReal lhs, const Field2D& rhs); //Field2D operator-(BoutReal lhs, const Field2D& rhs); template std::enable_if_t && is_expr_field2d_v, @@ -588,6 +589,28 @@ operator/(L lhs, const R& rhs) { std::nullopt, rhs.getMesh()->getRegion2D("RGN_ALL")}; } +#endif + +#define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(L lhs, const R & rhs) { \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + std::nullopt, \ + rhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } + +FIELD2D_BOUTREAL_FIELD2D_OP(+, Add) +FIELD2D_BOUTREAL_FIELD2D_OP(-, Sub) +FIELD2D_BOUTREAL_FIELD2D_OP(*, Mul) +FIELD2D_BOUTREAL_FIELD2D_OP(/, Div) /*! * Unary minus. 
Returns the negative of given field, diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index d4dd1397a7..8f433909f9 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -676,148 +676,33 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs); FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); -template && is_expr_field3d_v>> -BinaryExpr operator+(const L& lhs, const R& rhs) { - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - //std::cout << "RUNNING operator+ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Add{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() - ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} - -template && is_expr_field3d_v>> -BinaryExpr operator-(const L& lhs, const R& rhs) { - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - //std::cout << "RUNNING operator- using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Sub{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() - ? lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} - -template -std::enable_if_t && is_expr_field3d_v, - BinaryExpr> -operator*(const L& lhs, const R& rhs) { - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - //std::cout << "RUNNING operator* using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() - ? 
lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} +#define FIELD3D_FIELD3D_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template && is_expr_field3d_v>> \ + BinaryExpr operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = \ + lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + (regionID.has_value() ? lhs.getMesh()->getRegion(regionID.value()) \ + : lhs.getMesh()->getRegion("RGN_ALL"))}; \ + } -template && is_expr_field3d_v>> -BinaryExpr operator/(const L& lhs, const R& rhs) { - auto regionID = lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); - - //std::cout << "RUNNING operator/ using BinaryExpr with CUDA" << "\n"; - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Div{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - (regionID.has_value() - ? 
lhs.getMesh()->getRegion(regionID.value()) - : lhs.getMesh()->getRegion("RGN_ALL"))}; -} +FIELD3D_FIELD3D_FIELD3D_OP(+, Add) +FIELD3D_FIELD3D_FIELD3D_OP(-, Sub) +FIELD3D_FIELD3D_FIELD3D_OP(*, Mul) +FIELD3D_FIELD3D_FIELD3D_OP(/, Div) //Field3D operator+(const Field3D& lhs, const Field2D& rhs); -#if 0 -template && is_expr_field2d_v, - BinaryExpr>> -BinaryExpr operator+(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = lhs.getRegionID(); - - std::cout << "RUNNING Field3D + Field2D using BinaryExpr with CUDA" << "\n"; - int mesh_nz = lhs.getMesh()->LocalNz; - - return BinaryExpr{static_cast(lhs), - static_cast(rhs).setScale(1, mesh_nz), - bout::op::Add{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - rhs.getRegion("RGN_ALL")}; -} -#endif //Field3D operator-(const Field3D& lhs, const Field2D& rhs); //Field3D operator*(const Field3D& lhs, const Field2D& rhs); -#if 0 -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator*(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = lhs.getRegionID(); - - //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; - int mesh_nz = lhs.getMesh()->LocalNz; - - return BinaryExpr{ - static_cast(lhs), - static_cast(rhs).setScale(1, mesh_nz), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - lhs.getMesh()->getRegion("RGN_ALL")}; -} -#endif //Field3D operator/(const Field3D& lhs, const Field2D& rhs); -#if 0 -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator/(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = lhs.getRegionID(); - - //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; - int mesh_nz = lhs.getMesh()->LocalNz; - - return BinaryExpr{ - static_cast(lhs), - static_cast(rhs).setScale(1, mesh_nz), - 
bout::op::Div{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - regionID, - lhs.getMesh()->getRegion("RGN_ALL")}; -} -#endif #define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ template \ diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index c69597c8db..dca1605b91 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -1280,6 +1280,7 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } +#if 0 // Provide the C++ wrapper for addition of BoutReal and Field2D Field2D operator+(const BoutReal lhs, const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -1295,6 +1296,7 @@ Field2D operator+(const BoutReal lhs, const Field2D& rhs) { checkData(result); return result; } +#endif // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { From 25d1272acdae941d8141fb671a97a790438cb0b5 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sun, 1 Jun 2025 00:42:44 -0700 Subject: [PATCH 16/29] More operators and cleanup --- include/bout/field2d.hxx | 334 ++++++-------------------- include/bout/field3d.hxx | 158 ++----------- src/field/generated_fieldops.cxx | 386 +------------------------------ 3 files changed, 98 insertions(+), 780 deletions(-) diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index b7911b7e76..f430b98c6c 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -269,83 +269,23 @@ public: return operator()(jx, jy); } - /// In-place addition. Copy-on-write used if data is shared - //Field2D& operator+=(const Field2D& rhs); - template >> - Field2D& operator+=(const R& rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) + rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - - return *this; - } - /// In-place addition. 
Copy-on-write used if data is shared - Field2D& operator+=(BoutReal rhs); - /// In-place subtraction. Copy-on-write used if data is shared - //Field2D& operator-=(const Field2D& rhs); - //here1 - template >> - Field2D& operator-=(const R& rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) - rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - - return *this; +#define FIELD2D_OP_EQUALS(OP_SYM) \ + template \ + std::enable_if_t || is_expr_constant_v, Field2D&> \ + operator OP_SYM##=(R rhs) { \ + if (data.unique()) { \ + auto BE = (*this)OP_SYM rhs; \ + BE.evaluate(&data[0]); \ + } else { \ + (*this) = (*this)OP_SYM rhs; \ + } \ + return *this; \ } - /// In-place subtraction. Copy-on-write used if data is shared - Field2D& operator-=(BoutReal rhs); - /// In-place multiplication. Copy-on-write used if data is shared - //Field2D& operator*=(const Field2D& rhs); - template >> - Field2D& operator*=(const R& rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) * rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - return *this; - } - /// In-place multiplication. Copy-on-write used if data is shared - //Field2D& operator*=(BoutReal rhs); - template >> - Field2D& operator*=(R rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) * rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - - return *this; - } - /// In-place division. Copy-on-write used if data is shared - Field2D& operator/=(const Field2D& rhs); - /// In-place division. 
Copy-on-write used if data is shared - //Field2D& operator/=(BoutReal rhs); - template >> - Field2D& operator/=(R rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - auto BE = (*this) / rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) / rhs; - } - - return *this; - } + FIELD2D_OP_EQUALS(+) + FIELD2D_OP_EQUALS(-) + FIELD2D_OP_EQUALS(*) + FIELD2D_OP_EQUALS(/) // FieldData virtual functions @@ -405,191 +345,69 @@ private: // Non-member overloaded operators -//Field2D operator+(const Field2D& lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator+(const L& lhs, const R& rhs) { - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Add{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator-(const Field2D& lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator-(const L& lhs, const R& rhs) { - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Sub{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator*(const Field2D& lhs, const Field2D& rhs); -#if 1 -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator*(const L& lhs, const R& rhs) { - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -#endif -//Field2D operator/(const Field2D& lhs, const Field2D& rhs); -#if 1 -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr> -operator/(const L& lhs, const R& rhs) { - return BinaryExpr{static_cast(lhs), - static_cast(rhs), - bout::op::Div{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -#endif +#define 
FIELD2D_FIELD2D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> operator OP_SYM(const L & lhs, \ + const R & rhs) { \ + return BinaryExpr{static_cast(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } -Field3D operator+(const Field2D& lhs, const Field3D& rhs); -Field3D operator-(const Field2D& lhs, const Field3D& rhs); -//Field3D operator*(const Field2D& lhs, const Field3D& rhs); -template -std::enable_if_t && is_expr_field3d_v, - BinaryExpr> -operator*(const L& lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - auto regionID = rhs.getRegionID(); - - //std::cout << "RUNNING Field3D * Field2D using BinaryExpr with CUDA" << "\n"; - int mesh_nz = rhs.getMesh()->LocalNz; - - return BinaryExpr{ - static_cast(lhs).setScale(1, mesh_nz), - static_cast(rhs), - bout::op::Mul{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - regionID, - rhs.getMesh()->getRegion("RGN_ALL")}; -} -Field3D operator/(const Field2D& lhs, const Field3D& rhs); - -Field2D operator+(const Field2D& lhs, BoutReal rhs); -//Field2D operator-(const Field2D& lhs, BoutReal rhs); -template -std::enable_if_t && is_expr_constant_v, - BinaryExpr, bout::op::Sub>> -operator-(const L& lhs, R rhs) { - return BinaryExpr, bout::op::Sub>{ - static_cast(lhs), - static_cast::View>(rhs), - bout::op::Sub{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator*(const Field2D& lhs, BoutReal rhs); -template -std::enable_if_t && is_expr_constant_v, - BinaryExpr, bout::op::Mul>> -operator*(const L& lhs, R rhs) { - return BinaryExpr, bout::op::Mul>{ - static_cast(lhs), - static_cast::View>(rhs), - bout::op::Mul{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - 
lhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator/(const Field2D& lhs, BoutReal rhs); -template -std::enable_if_t && is_expr_constant_v, - BinaryExpr, bout::op::Div>> -operator/(const L& lhs, R rhs) { - return BinaryExpr, bout::op::Div>{ - static_cast(lhs), - static_cast::View>(rhs), - bout::op::Div{}, - lhs.getMesh(), - lhs.getLocation(), - lhs.getDirections(), - std::nullopt, - lhs.getMesh()->getRegion2D("RGN_ALL")}; -} +FIELD2D_FIELD2D_FIELD2D_OP(+, Add) +FIELD2D_FIELD2D_FIELD2D_OP(-, Sub) +FIELD2D_FIELD2D_FIELD2D_OP(*, Mul) +FIELD2D_FIELD2D_FIELD2D_OP(/, Div) + +#define FIELD3D_FIELD2D_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr> operator OP_SYM(const L & lhs, \ + const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + int mesh_nz = rhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs).setScale(1, mesh_nz), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ + } -#if 0 -//Field2D operator+(BoutReal lhs, const Field2D& rhs); -//Field2D operator-(BoutReal lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr, R, bout::op::Sub>> -operator-(L lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - - return BinaryExpr, R, bout::op::Sub>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Sub{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - std::nullopt, - rhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator*(BoutReal lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr, R, bout::op::Mul>> -operator*(L lhs, const R& rhs) { - //static_assert(always_false || always_false, "Hello"); - - return BinaryExpr, R, bout::op::Mul>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Mul{}, - rhs.getMesh(), - rhs.getLocation(), - 
rhs.getDirections(), - std::nullopt, - rhs.getMesh()->getRegion2D("RGN_ALL")}; -} -//Field2D operator/(BoutReal lhs, const Field2D& rhs); -template -std::enable_if_t && is_expr_field2d_v, - BinaryExpr, R, bout::op::Div>> -operator/(L lhs, const R& rhs) { - return BinaryExpr, R, bout::op::Div>{ - static_cast::View>(lhs), - static_cast(rhs), - bout::op::Div{}, - rhs.getMesh(), - rhs.getLocation(), - rhs.getDirections(), - std::nullopt, - rhs.getMesh()->getRegion2D("RGN_ALL")}; -} -#endif +FIELD3D_FIELD2D_FIELD3D_OP(+, Add) +FIELD3D_FIELD2D_FIELD3D_OP(-, Sub) +FIELD3D_FIELD2D_FIELD3D_OP(*, Mul) +FIELD3D_FIELD2D_FIELD3D_OP(/, Div) + +#define FIELD2D_FIELD2D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ + } + +FIELD2D_FIELD2D_BOUTREAL_OP(+, Add) +FIELD2D_FIELD2D_BOUTREAL_OP(-, Sub) +FIELD2D_FIELD2D_BOUTREAL_OP(*, Mul) +FIELD2D_FIELD2D_BOUTREAL_OP(/, Div) #define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ template \ diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 8f433909f9..fac2adc337 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -478,135 +478,26 @@ public: ///@} - /// Addition operators - ///@{ - //Field3D& operator+=(const Field3D& rhs); - template >> - Field3D& operator+=(const R& rhs) { - //printf("RUNNING operator+= with CUDA\n"); - if (data.unique()) { - //std::cout << "RUNNING Field3D operator+= w/ CUDA" << __FILE__ << " " - // << std::to_string(__LINE__) << "\n"; - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - auto BE = (*this) + rhs; - regionID = BE.getRegionID(); - BE.evaluate(&data[0]); - } else { - (*this) = (*this) + rhs; - } - - return *this; - } - Field3D& operator+=(const Field2D& rhs); - Field3D& operator+=(BoutReal rhs); - ///@} - - /// Subtraction operators - ///@{ - //Field3D& operator-=(const Field3D& rhs); - template >> - Field3D& operator-=(const R& rhs) { - if (data.unique()) { - //printf("RUNNING operator-= with CUDA with BE\n"); - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - auto BE = (*this) - rhs; - BE.evaluate(&data[0]); - } else { - //printf("RUNNING operator-= with CUDA with operation\n"); - (*this) = (*this) - rhs; - } - - return *this; - } - Field3D& operator-=(const Field2D& rhs); - Field3D& operator-=(BoutReal rhs); - ///@} - - /// Multiplication operators - ///@{ - //Field3D& operator*=(const Field3D& rhs); - template >> - Field3D& operator*=(const R& rhs) { - //printf("RUNNING operator*= with CUDA\n"); - if (data.unique()) { - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - auto BE = (*this) * rhs; - regionID = BE.getRegionID(); - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - - return *this; +#define FIELD3D_OP_EQUALS(OP_SYM) \ + template \ + std::enable_if_t || is_expr_field2d_v \ + || is_expr_constant_v, \ + Field3D&> operator OP_SYM##=(const R & rhs) { \ + if (data.unique()) { \ + clearParallelSlices(); \ + auto Expr = (*this)OP_SYM rhs; \ + Expr.evaluate(&data[0]); \ + } else { \ + (*this) = (*this)OP_SYM rhs; \ + } \ + return *this; \ } - Field3D& operator*=(const Field2D& rhs); - //Field3D& operator*=(BoutReal rhs); - // here1 - template >> - Field3D& operator*=(R rhs) { - //printf("RUNNING operator*= with CUDA\n"); - if (data.unique()) { - // Delete existing parallel slices. 
We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - auto BE = (*this) * rhs; - regionID = BE.getRegionID(); - BE.evaluate(&data[0]); - } else { - (*this) = (*this) * rhs; - } - return *this; - } - ///@} + FIELD3D_OP_EQUALS(+) + FIELD3D_OP_EQUALS(-) + FIELD3D_OP_EQUALS(*) + FIELD3D_OP_EQUALS(/) - /// Division operators - ///@{ - //Field3D& operator/=(const Field3D& rhs); - template - std::enable_if_t,Field3D&> operator/=(const R& rhs) { - //printf("RUNNING operator/= with CUDA\n"); - if (data.unique()) { - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - auto BE = (*this) / rhs; - regionID = BE.getRegionID(); - BE.evaluate(&data[0]); - } else { - (*this) = (*this) / rhs; - } - - return *this; - } - //Field3D& operator/=(const Field2D& rhs); - template -std::enable_if_t, Field3D&> operator/=(const R& rhs) { - //printf("RUNNING operator/= with CUDA\n"); - if (data.unique()) { - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - auto BE = (*this) / rhs; - BE.evaluate(&data[0]); - } else { - (*this) = (*this) / rhs; - } - - return *this; - } - Field3D& operator/=(BoutReal rhs); ///@} // FieldData virtual functions @@ -699,11 +590,6 @@ FIELD3D_FIELD3D_FIELD3D_OP(-, Sub) FIELD3D_FIELD3D_FIELD3D_OP(*, Mul) FIELD3D_FIELD3D_FIELD3D_OP(/, Div) -//Field3D operator+(const Field3D& lhs, const Field2D& rhs); -//Field3D operator-(const Field3D& lhs, const Field2D& rhs); -//Field3D operator*(const Field3D& lhs, const Field2D& rhs); -//Field3D operator/(const Field3D& lhs, const Field2D& rhs); - #define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ template \ std::enable_if_t && is_expr_field2d_v, \ @@ -727,11 +613,6 @@ FIELD3D_FIELD3D_FIELD2D_OP(-, Sub) FIELD3D_FIELD3D_FIELD2D_OP(*, Mul) FIELD3D_FIELD3D_FIELD2D_OP(/, Div) -//Field3D operator+(const Field3D& lhs, BoutReal rhs); -//Field3D operator-(const Field3D& lhs, BoutReal rhs); -//Field3D operator*(const Field3D& lhs, BoutReal rhs); -//Field3D operator/(const Field3D& lhs, BoutReal rhs); - #define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ template \ std::enable_if_t && is_expr_constant_v, \ @@ -754,11 +635,6 @@ FIELD3D_FIELD3D_BOUTREAL_OP(-, Sub) FIELD3D_FIELD3D_BOUTREAL_OP(*, Mul) FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) -//Field3D operator+(BoutReal lhs, const Field3D& rhs); -//Field3D operator-(BoutReal lhs, const Field3D& rhs); -//Field3D operator*(BoutReal lhs, const Field3D& rhs); -//Field3D operator/(BoutReal lhs, const Field3D& rhs); - #define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ template \ std::enable_if_t && is_expr_field3d_v, \ diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index dca1605b91..c78a9ed7b7 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -6,148 +6,6 @@ #include #include -// Provide the C++ operator to update Field3D by multiplication with Field2D -Field3D& Field3D::operator*=(const Field2D& rhs) { - std::cout << 
"RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. - clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] *= rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) * rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for addition of Field3D and Field2D -Field3D operator+(const Field3D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] + rhs[index]; - } - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by addition with Field2D -Field3D& Field3D::operator+=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] += rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for subtraction of Field3D and Field2D -Field3D operator-(const Field3D& lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[base_ind + jz] - rhs[index]; - } - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by subtraction with Field2D -Field3D& Field3D::operator-=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, rhs.getRegion("RGN_ALL")) { - const auto base_ind = fieldmesh->ind2Dto3D(index); - for (int jz = 0; jz < fieldmesh->LocalNz; ++jz) { - (*this)[base_ind + jz] -= rhs[index]; - } - } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - // Provide the C++ wrapper for multiplication of Field3D and FieldPerp FieldPerp operator*(const Field3D& lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -233,211 +91,6 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { } #if 0 -// Provide the C++ wrapper for division of Field3D and BoutReal -Field3D operator/(const Field3D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] * tmp; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by division with BoutReal -Field3D& Field3D::operator/=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - const auto tmp = 1.0 / rhs; - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] *= tmp; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for addition of Field3D and BoutReal -Field3D operator+(const Field3D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by addition with BoutReal -Field3D& Field3D::operator+=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} - -#if 0 -// Provide the C++ wrapper for subtraction of Field3D and BoutReal -Field3D operator-(const Field3D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field3D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(lhs.getRegionID()); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] - rhs; - } - - checkData(result); - return result; -} -#endif - -// Provide the C++ operator to update Field3D by subtraction with BoutReal -Field3D& Field3D::operator-=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - // Delete existing parallel slices. We don't copy parallel slices, so any - // that currently exist will be incorrect. 
- clearParallelSlices(); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} - -// Provide the C++ wrapper for division of Field2D and Field3D -Field3D operator/(const Field2D& lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] / rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for addition of Field2D and Field3D -Field3D operator+(const Field2D& lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] + rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - -// Provide the C++ wrapper for subtraction of Field2D and Field3D -Field3D operator-(const Field2D& lhs, const Field3D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - ASSERT1_FIELDS_COMPATIBLE(lhs, rhs); - - Field3D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - result.setRegion(rhs.getRegionID()); - - Mesh* localmesh = lhs.getMesh(); - - BOUT_FOR(index, 
lhs.getRegion("RGN_ALL")) { - const auto base_ind = localmesh->ind2Dto3D(index); - for (int jz = 0; jz < localmesh->LocalNz; ++jz) { - result[base_ind + jz] = lhs[index] - rhs[base_ind + jz]; - } - } - - checkData(result); - return result; -} - // Provide the C++ operator to update Field2D by division with Field2D Field2D& Field2D::operator/=(const Field2D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -458,6 +111,7 @@ Field2D& Field2D::operator/=(const Field2D& rhs) { } return *this; } +#endif // Provide the C++ wrapper for multiplication of Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { @@ -543,22 +197,7 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } -// Provide the C++ wrapper for addition of Field2D and BoutReal -Field2D operator+(const Field2D& lhs, const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(lhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs[index] + rhs; - } - - checkData(result); - return result; -} - +#if 0 // Provide the C++ operator to update Field2D by addition with BoutReal Field2D& Field2D::operator+=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -578,7 +217,9 @@ Field2D& Field2D::operator+=(const BoutReal rhs) { } return *this; } +#endif +#if 0 // Provide the C++ operator to update Field2D by subtraction with BoutReal Field2D& Field2D::operator-=(const BoutReal rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -598,6 +239,7 @@ Field2D& Field2D::operator-=(const BoutReal rhs) { } return *this; } +#endif // Provide the C++ wrapper for multiplication of FieldPerp and Field3D FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) 
{ @@ -1280,24 +922,6 @@ FieldPerp& FieldPerp::operator-=(const BoutReal rhs) { return *this; } -#if 0 -// Provide the C++ wrapper for addition of BoutReal and Field2D -Field2D operator+(const BoutReal lhs, const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - - Field2D result{emptyFrom(rhs)}; - checkData(lhs); - checkData(rhs); - - BOUT_FOR(index, result.getValidRegionWithDefault("RGN_ALL")) { - result[index] = lhs + rhs[index]; - } - - checkData(result); - return result; -} -#endif - // Provide the C++ wrapper for multiplication of BoutReal and FieldPerp FieldPerp operator*(const BoutReal lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; From 56a8678fb275fd6035e99bb1d506f047817a6d8d Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Sun, 1 Jun 2025 06:30:58 -0700 Subject: [PATCH 17/29] Cleanup --- src/field/generated_fieldops.cxx | 67 -------------------------------- 1 file changed, 67 deletions(-) diff --git a/src/field/generated_fieldops.cxx b/src/field/generated_fieldops.cxx index c78a9ed7b7..022fedbd17 100644 --- a/src/field/generated_fieldops.cxx +++ b/src/field/generated_fieldops.cxx @@ -90,29 +90,6 @@ FieldPerp operator-(const Field3D& lhs, const FieldPerp& rhs) { return result; } -#if 0 -// Provide the C++ operator to update Field2D by division with Field2D -Field2D& Field2D::operator/=(const Field2D& rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - ASSERT1_FIELDS_COMPATIBLE(*this, rhs); - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] /= rhs[index]; } - - checkData(*this); - - } else { - (*this) = (*this) / rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for multiplication of 
Field2D and FieldPerp FieldPerp operator*(const Field2D& lhs, const FieldPerp& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; @@ -197,50 +174,6 @@ FieldPerp operator-(const Field2D& lhs, const FieldPerp& rhs) { return result; } -#if 0 -// Provide the C++ operator to update Field2D by addition with BoutReal -Field2D& Field2D::operator+=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] += rhs; } - - checkData(*this); - - } else { - (*this) = (*this) + rhs; - } - return *this; -} -#endif - -#if 0 -// Provide the C++ operator to update Field2D by subtraction with BoutReal -Field2D& Field2D::operator-=(const BoutReal rhs) { - std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; - // only if data is unique we update the field - // otherwise just call the non-inplace version - if (data.unique()) { - - checkData(*this); - checkData(rhs); - - BOUT_FOR(index, this->getRegion("RGN_ALL")) { (*this)[index] -= rhs; } - - checkData(*this); - - } else { - (*this) = (*this) - rhs; - } - return *this; -} -#endif - // Provide the C++ wrapper for multiplication of FieldPerp and Field3D FieldPerp operator*(const FieldPerp& lhs, const Field3D& rhs) { std::cout << "RUNNING operator " << __FILE__ << " " << std::to_string(__LINE__) << "\n"; From 3e3a0ea3973f222a31091ae42c1c360ebb34da3f Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Mon, 2 Jun 2025 20:09:40 -0700 Subject: [PATCH 18/29] Add __host__ to make evaluator host-callable, remove offset --- include/bout/bout_types.hxx | 2 +- include/bout/field2d.hxx | 7 +++- include/bout/field3d.hxx | 11 ++--- include/bout/fieldops.hxx | 81 
+++++++++++++++++++++++++------------ 4 files changed, 64 insertions(+), 37 deletions(-) diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index b2f38b61aa..c725c281d3 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -146,7 +146,7 @@ struct Constant { struct View { T v; View(T v) : v(v) {} - __device__ T operator()(int) const { return v; } + __host__ __device__ T operator()(int) const { return v; } }; operator View() const { return {val}; } }; diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index f430b98c6c..88f67f277b 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -108,6 +108,7 @@ public: || (is_expr_constant_v && is_expr_field2d_v) || (is_expr_field2d_v && is_expr_constant_v)>> Field2D(const BinaryExpr& expr) { + std::cout << "RUNNING Field2D constructor with CUDA\n"; Array data{expr.size()}; expr.evaluate(&data[0]); *this = std::move(Field2D{std::move(data), expr.getMesh(), expr.getLocation(), @@ -315,8 +316,10 @@ public: BoutReal* data; int mul = 1; int div = 1; - __device__ inline BoutReal operator()(int idx) const { return data[(idx*mul/div)]; } - __device__ inline BoutReal& operator[](int idx) const { + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul / div)]; + } + __host__ __device__ inline BoutReal& operator[](int idx) const { return data[(idx * mul)/div]; } diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index fac2adc337..5056a4128b 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -429,12 +429,11 @@ public: BoutReal* data; int mul = 1; int div = 1; - int offset = 0; __host__ __device__ inline BoutReal operator()(int idx) const { - return data[(idx * mul) / div + offset]; + return data[(idx * mul) / div]; } - __device__ inline BoutReal& operator[](int idx) const { - return data[(idx * mul) / div + offset]; + __host__ __device__ inline BoutReal& operator[](int idx) const { + return 
data[(idx * mul) / div]; } View& setScale(int mul, int div) { @@ -442,10 +441,6 @@ public: this->div = div; return *this; } - View& setOffset(int o) { - offset = o; - return *this; - } }; operator View() { return View{&data[0]}; } operator View() const { return View{const_cast(&data[0])}; } diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 0de73d1205..9346b55e48 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -38,55 +38,58 @@ template struct is_expr_constant> : std::integral_constant>> {}; -constexpr int THREADS = 256; +constexpr int THREADS = 128; namespace bout { namespace op { struct Assign { int scale = 1; int offset = 0; template - __device__ void operator()(int idx, BoutReal* out, const Expr& expr) const { + __host__ __device__ void operator()(int idx, BoutReal* out, const Expr& expr) const { out[(idx * scale) + offset] = expr.lhs(idx) + expr.rhs(idx); } }; struct Add { template - __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { return L(idx) + R(idx); } - __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { return a + b; } }; struct Sub { template - __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { return L(idx) - R(idx); } - __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { return a - b; } }; struct Mul { template - __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + __host__ __device__ __forceinline__ BoutReal 
operator()(int idx, const LView& L, + const RView& R) const { return L(idx) * R(idx); } - __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { return a * b; } }; struct Div { template - __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, - const RView& R) const { + __host__ __device__ __forceinline__ BoutReal operator()(int idx, const LView& L, + const RView& R) const { return L(idx) / R(idx); } - __device__ __forceinline__ BoutReal operator()(BoutReal a, BoutReal b) const { + __host__ __device__ __forceinline__ BoutReal operator()(BoutReal a, + BoutReal b) const { return a / b; } }; @@ -96,13 +99,24 @@ struct Add { template __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Expr expr) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= expr.size()) { + int e = expr.size(); + + // In-bounds version + //if (tid < e) { + // int idx = expr.regionIdx(tid); + // out[idx] = expr(idx); // single‐pass fusion + //} + + // Out-of-bounds version + if (tid >= e) { return; } int idx = expr.regionIdx(tid); out[idx] = expr(idx); // single‐pass fusion + + // Grid-strided loop //int stride = blockDim.x * gridDim.x; - //for (int i = tid, e = expr.size(); i < e; i += stride) { + //for (int i = tid; i < e; i += stride) { // int idx = expr.regionIdx(i); // out[idx] = expr(idx); // single‐pass fusion //} @@ -133,6 +147,18 @@ struct BinaryExpr { for (int i = 0; i < indices.size(); ++i) { indices[i] = region.getIndices()[i].ind; } + //std::cout << "===PRE-sorting indices\n"; + //for (auto& ind : indices) { + // std::cout << ind << " "; + //} + //std::cout << "===end PRE\n"; + //std::sort(indices.begin(), indices.end(), + // [](const auto& a, const auto& b) { return a < b; }); + //std::cout << "===POST-sorting indices\n"; + //for (auto& ind : indices) { + // std::cout << ind << " "; + //} + //std::cout << "===end 
POST\n"; //if (regionIndicesCache.find(static_cast(const_cast*>(®ion))) // != regionIndicesCache.end()) { // // If we have already computed the indices for this region, use them @@ -167,23 +193,19 @@ struct BinaryExpr { Func f; int mul = 1; int div = 1; - int offset = 0; View& setScale(int mul, int div) { this->mul = mul; this->div = div; return *this; } - View& setOffset(int o) { - offset = o; - return *this; + __host__ __device__ __forceinline__ int size() const { return num_indices; } + __host__ __device__ __forceinline__ int regionIdx(int idx) const { + return indices[idx]; } - - __device__ __forceinline__ int size() const { return num_indices; } - __device__ __forceinline__ int regionIdx(int idx) const { return indices[idx]; } - __device__ __forceinline__ BoutReal operator()(int idx) const { - return f((idx * mul) / div, lhs, rhs); // single‐pass fusion - //return f(lhs(idx), rhs(idx)); // single‐pass fusion + __host__ __device__ __forceinline__ BoutReal operator()(int idx) const { + //return f((idx * mul) / div, lhs, rhs); // single‐pass fusion + return f(lhs((idx * mul) / div), rhs((idx * mul) / div)); // single‐pass fusion } }; @@ -194,6 +216,13 @@ struct BinaryExpr { int blocks = (size() + THREADS - 1) / THREADS; evaluatorExpr<<>>(&data[0], static_cast(*this)); cudaDeviceSynchronize(); + // OpenMP impl. + //int e = size(); + //#pragma omp parallel for + //for (int i = 0; i < e; ++i) { + // int idx = regionIdx(i); + // data[idx] = operator()(idx); // single‐pass fusion + //} } Mesh* getMesh() const { return mesh; } From 56fe675a77441ec3f447b54d97ea49f7069e8375 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Wed, 4 Jun 2025 01:18:13 -0700 Subject: [PATCH 19/29] Update - Add field functions (sqrt, abs, etc.) - Add ResT template parameter to BinaryExpr for future use - Update some operators (min, max, etc.) 
to take as input a BinaryExpr and evaluate it before apply --- .../elm-pb-outerloop/elm_pb_outerloop.cxx | 8 +- include/bout/field.hxx | 52 +++++-- include/bout/field2d.hxx | 127 +++++++++--------- include/bout/field3d.hxx | 110 +++++++-------- include/bout/fieldops.hxx | 13 +- include/bout/fieldperp.hxx | 39 ++++++ src/field/field2d.cxx | 3 +- src/field/fieldperp.cxx | 2 +- src/mesh/coordinates.cxx | 2 +- src/physics/snb.cxx | 4 +- 10 files changed, 219 insertions(+), 141 deletions(-) diff --git a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx index 901e57bf97..96c28bab12 100644 --- a/examples/elm-pb-outerloop/elm_pb_outerloop.cxx +++ b/examples/elm-pb-outerloop/elm_pb_outerloop.cxx @@ -713,7 +713,7 @@ class ELMpb : public PhysicsModel { diamag_phi0 = false; K_H_term = false; } else { - Dphi0 = -D_min - 0.5 * D_0 * (1.0 - tanh(Field2D{D_s * (x - x0)})); + Dphi0 = -D_min - 0.5 * D_0 * (1.0 - tanh(D_s * (x - x0))); } if (sign < 0) { // change flow direction @@ -1031,7 +1031,7 @@ class ELMpb : public PhysicsModel { vacuum_trans *= pnorm; // Transitions from 0 in core to 1 in vacuum - Field2D tanh_res = tanh(Field2D{(P0 - vacuum_pressure) / vacuum_trans}); + Field2D tanh_res = tanh((P0 - vacuum_pressure) / vacuum_trans); vac_mask = (1.0 - tanh_res) / 2.0; if (spitzer_resist) { @@ -1283,7 +1283,7 @@ class ELMpb : public PhysicsModel { //////////////////////////////////////////// // Transitions from 0 in core to 1 in vacuum if (nonlinear) { - vac_mask = (1.0 - tanh(Field3D{((P0 + P) - vacuum_pressure) / vacuum_trans})) / 2.0; + vac_mask = (1.0 - tanh(((P0 + P) - vacuum_pressure) / vacuum_trans)) / 2.0; // Update resistivity if (spitzer_resist) { @@ -1794,7 +1794,7 @@ class ELMpb : public PhysicsModel { // Calculate coefficient. 
hyper_mu_x = hyperviscos * metric->g_11 * SQ(metric->dx) - * abs(Field3D{metric->g11 * D2DX2(U)}) / (abs(U) + 1e-3); + * abs(metric->g11 * D2DX2(U)) / (abs(U) + 1e-3); hyper_mu_x.applyBoundary("dirichlet"); // Set to zero on all boundaries ddt(U) += hyper_mu_x * metric->g11 * D2DX2(U); diff --git a/include/bout/field.hxx b/include/bout/field.hxx index 188d529ef0..fe2b4767d2 100644 --- a/include/bout/field.hxx +++ b/include/bout/field.hxx @@ -44,6 +44,8 @@ class Field; #include #include +#include "bout/fieldops.hxx" + class Mesh; /// Base class for scalar fields @@ -327,6 +329,12 @@ inline BoutReal min(const T& f, bool allpe = false, return result; } +template +inline BoutReal min(const BinaryExpr& f, bool allpe = false, + const std::string& rgn = "RGN_NOBNDRY") { + return min(ResT{f}, allpe, rgn); +} + /// Returns true if all elements of \p f over \p region are equal. By /// default only checks the local processor, use \p allpe to check /// globally @@ -412,6 +420,12 @@ inline BoutReal max(const T& f, bool allpe = false, return result; } +template +inline BoutReal max(const BinaryExpr& f, bool allpe = false, + const std::string& rgn = "RGN_NOBNDRY") { + return max(ResT{f}, allpe, rgn); +} + /// Mean of \p f, excluding the boundary/guard cells by default (can /// be changed with \p rgn argument). 
/// @@ -519,17 +533,33 @@ T pow(BoutReal lhs, const T& rhs, const std::string& rgn = "RGN_ALL") { #ifdef FIELD_FUNC #error This macro has already been defined #else -#define FIELD_FUNC(name, func) \ - template > \ - inline T name(const T& f, const std::string& rgn = "RGN_ALL") { \ - AUTO_TRACE(); \ - /* Check if the input is allocated */ \ - checkData(f); \ - /* Define and allocate the output result */ \ - T result{emptyFrom(f)}; \ - BOUT_FOR(d, result.getRegion(rgn)) { result[d] = func(f[d]); } \ - checkData(result); \ - return result; \ +#define FIELD_FUNC(name, func) \ + namespace bout::op { \ + struct name { \ + template \ + __host__ __device__ BoutReal operator()(int idx, const LView& L, \ + const RView& R) const { \ + return func(L(idx)); \ + } \ + }; \ + }; \ + template > \ + inline BinaryExpr name(const T& f, \ + const std::string& rgn = "RGN_ALL") { \ + std::cout << "RUNNING " #name " with CUDA\n"; \ + return BinaryExpr{static_cast(f), \ + static_cast(f), \ + bout::op::name{}, \ + f.getMesh(), \ + f.getLocation(), \ + f.getDirections(), \ + std::nullopt, \ + f.getRegion(rgn)}; \ + } \ + template \ + inline BinaryExpr name( \ + const BinaryExpr& f, const std::string& rgn = "RGN_ALL") { \ + return name(ResT{f}, rgn); \ } #endif diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index 88f67f277b..da8de551ad 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -47,8 +47,8 @@ class Field2D; class Field3D; class Mesh; -template -struct is_expr_field2d> +template +struct is_expr_field2d> : std::integral_constant> && is_expr_field2d_v>) || (is_expr_constant_v> @@ -103,11 +103,11 @@ public: ZDirectionType::Average}); template < - typename L, typename R, typename Func, + typename ResT, typename L, typename R, typename Func, typename = std::enable_if_t<(is_expr_field2d_v && is_expr_field2d_v) || (is_expr_constant_v && is_expr_field2d_v) || (is_expr_field2d_v && is_expr_constant_v)>> - Field2D(const BinaryExpr& expr) { + Field2D(const 
BinaryExpr& expr) { std::cout << "RUNNING Field2D constructor with CUDA\n"; Array data{expr.size()}; expr.evaluate(&data[0]); @@ -189,9 +189,9 @@ public: */ Field2D& operator=(BoutReal rhs); - template + template std::enable_if_t, Field2D&> - operator=(const BinaryExpr& expr) { + operator=(const BinaryExpr& expr) { std::cout << "RUNNING Field2D operator= with CUDA\n"; if (isAllocated()) { expr.evaluate(&data[0]); @@ -348,19 +348,20 @@ private: // Non-member overloaded operators -#define FIELD2D_FIELD2D_FIELD2D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field2d_v, \ - BinaryExpr> operator OP_SYM(const L & lhs, \ - const R & rhs) { \ - return BinaryExpr{static_cast(lhs), \ - static_cast(rhs), \ - bout::op::OP_TYPE{}, \ - lhs.getMesh(), \ - lhs.getLocation(), \ - lhs.getDirections(), \ - std::nullopt, \ - lhs.getMesh()->getRegion2D("RGN_ALL")}; \ +#define FIELD2D_FIELD2D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ } FIELD2D_FIELD2D_FIELD2D_OP(+, Add) @@ -368,22 +369,22 @@ FIELD2D_FIELD2D_FIELD2D_OP(-, Sub) FIELD2D_FIELD2D_FIELD2D_OP(*, Mul) FIELD2D_FIELD2D_FIELD2D_OP(/, Div) -#define FIELD3D_FIELD2D_FIELD3D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field3d_v, \ - BinaryExpr> operator OP_SYM(const L & lhs, \ - const R & rhs) { \ - auto regionID = rhs.getRegionID(); \ - int mesh_nz = rhs.getMesh()->LocalNz; \ - return BinaryExpr{ \ - static_cast(lhs).setScale(1, mesh_nz), \ - static_cast(rhs), \ - bout::op::OP_TYPE{}, \ - rhs.getMesh(), \ - rhs.getLocation(), \ - rhs.getDirections(), \ - regionID, \ - rhs.getMesh()->getRegion("RGN_ALL")}; \ +#define FIELD3D_FIELD2D_FIELD3D_OP(OP_SYM, OP_TYPE) \ + 
template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + int mesh_nz = rhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs).setScale(1, mesh_nz), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ } FIELD3D_FIELD2D_FIELD3D_OP(+, Add) @@ -391,20 +392,20 @@ FIELD3D_FIELD2D_FIELD3D_OP(-, Sub) FIELD3D_FIELD2D_FIELD3D_OP(*, Mul) FIELD3D_FIELD2D_FIELD3D_OP(/, Div) -#define FIELD2D_FIELD2D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_constant_v, \ - BinaryExpr, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, R rhs) { \ - return BinaryExpr, bout::op::OP_TYPE>{ \ - static_cast(lhs), \ - static_cast::View>(rhs), \ - bout::op::OP_TYPE{}, \ - lhs.getMesh(), \ - lhs.getLocation(), \ - lhs.getDirections(), \ - std::nullopt, \ - lhs.getMesh()->getRegion2D("RGN_ALL")}; \ +#define FIELD2D_FIELD2D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + std::nullopt, \ + lhs.getMesh()->getRegion2D("RGN_ALL")}; \ } FIELD2D_FIELD2D_BOUTREAL_OP(+, Add) @@ -412,20 +413,20 @@ FIELD2D_FIELD2D_BOUTREAL_OP(-, Sub) FIELD2D_FIELD2D_BOUTREAL_OP(*, Mul) FIELD2D_FIELD2D_BOUTREAL_OP(/, Div) -#define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field2d_v, \ - BinaryExpr, R, bout::op::OP_TYPE>> \ - operator OP_SYM(L lhs, const R & rhs) { \ - return BinaryExpr, R, bout::op::OP_TYPE>{ \ - static_cast::View>(lhs), \ - static_cast(rhs), \ - bout::op::OP_TYPE{}, \ - rhs.getMesh(), \ - 
rhs.getLocation(), \ - rhs.getDirections(), \ - std::nullopt, \ - rhs.getMesh()->getRegion2D("RGN_ALL")}; \ +#define FIELD2D_BOUTREAL_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(L lhs, const R & rhs) { \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + std::nullopt, \ + rhs.getMesh()->getRegion2D("RGN_ALL")}; \ } FIELD2D_BOUTREAL_FIELD2D_OP(+, Add) diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 5056a4128b..62b299bc48 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -187,7 +187,7 @@ public: ZDirectionType::Standard}); template || is_expr_field3d_v>> - Field3D(const BinaryExpr& expr) { + Field3D(const BinaryExpr& expr) { //std::cout << "RUNNING constructor from BinaryExpr\n"; Array data{expr.size()}; expr.evaluate(&data[0]); @@ -457,9 +457,9 @@ public: /// return void, as only part initialised void operator=(const FieldPerp& rhs); Field3D& operator=(BoutReal val); - template + template std::enable_if_t, Field3D&> - operator=(BinaryExpr& expr) { + operator=(BinaryExpr& expr) { std::cout << "RUNNING operator= with CUDA\n"; regionID = expr.getRegionID(); if(isAllocated()) { @@ -565,10 +565,11 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); #define FIELD3D_FIELD3D_FIELD3D_OP(OP_SYM, OP_TYPE) \ template && is_expr_field3d_v>> \ - BinaryExpr operator OP_SYM(const L & lhs, const R & rhs) { \ + BinaryExpr operator OP_SYM(const L & lhs, \ + const R & rhs) { \ auto regionID = \ lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); \ - return BinaryExpr{ \ + return BinaryExpr{ \ static_cast(lhs), \ static_cast(rhs), \ bout::op::OP_TYPE{}, \ @@ -585,22 +586,22 @@ FIELD3D_FIELD3D_FIELD3D_OP(-, Sub) FIELD3D_FIELD3D_FIELD3D_OP(*, Mul) FIELD3D_FIELD3D_FIELD3D_OP(/, 
Div) -#define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field2d_v, \ - BinaryExpr> operator OP_SYM(const L & lhs, \ - const R & rhs) { \ - auto regionID = lhs.getRegionID(); \ - int mesh_nz = lhs.getMesh()->LocalNz; \ - return BinaryExpr{ \ - static_cast(lhs), \ - static_cast(rhs).setScale(1, mesh_nz), \ - bout::op::OP_TYPE{}, \ - lhs.getMesh(), \ - lhs.getLocation(), \ - lhs.getDirections(), \ - regionID, \ - lhs.getMesh()->getRegion("RGN_ALL")}; \ +#define FIELD3D_FIELD3D_FIELD2D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field2d_v, \ + BinaryExpr> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = lhs.getRegionID(); \ + int mesh_nz = lhs.getMesh()->LocalNz; \ + return BinaryExpr{ \ + static_cast(lhs), \ + static_cast(rhs).setScale(1, mesh_nz), \ + bout::op::OP_TYPE{}, \ + lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + lhs.getMesh()->getRegion("RGN_ALL")}; \ } FIELD3D_FIELD3D_FIELD2D_OP(+, Add) @@ -608,21 +609,21 @@ FIELD3D_FIELD3D_FIELD2D_OP(-, Sub) FIELD3D_FIELD3D_FIELD2D_OP(*, Mul) FIELD3D_FIELD3D_FIELD2D_OP(/, Div) -#define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_constant_v, \ - BinaryExpr, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, R rhs) { \ - auto regionID = lhs.getRegionID(); \ - return BinaryExpr, bout::op::OP_TYPE>{ \ - static_cast(lhs), \ - static_cast::View>(rhs), \ - bout::op::OP_TYPE{}, \ - lhs.getMesh(), \ - lhs.getLocation(), \ - lhs.getDirections(), \ - regionID, \ - lhs.getMesh()->getRegion("RGN_ALL")}; \ +#define FIELD3D_FIELD3D_BOUTREAL_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_constant_v, \ + BinaryExpr, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, R rhs) { \ + auto regionID = lhs.getRegionID(); \ + return BinaryExpr, bout::op::OP_TYPE>{ \ + static_cast(lhs), \ + static_cast::View>(rhs), \ + bout::op::OP_TYPE{}, \ + 
lhs.getMesh(), \ + lhs.getLocation(), \ + lhs.getDirections(), \ + regionID, \ + lhs.getMesh()->getRegion("RGN_ALL")}; \ } FIELD3D_FIELD3D_BOUTREAL_OP(+, Add) @@ -630,21 +631,21 @@ FIELD3D_FIELD3D_BOUTREAL_OP(-, Sub) FIELD3D_FIELD3D_BOUTREAL_OP(*, Mul) FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) -#define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ - template \ - std::enable_if_t && is_expr_field3d_v, \ - BinaryExpr, R, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, const R & rhs) { \ - auto regionID = rhs.getRegionID(); \ - return BinaryExpr, R, bout::op::OP_TYPE>{ \ - static_cast::View>(lhs), \ - static_cast(rhs), \ - bout::op::OP_TYPE{}, \ - rhs.getMesh(), \ - rhs.getLocation(), \ - rhs.getDirections(), \ - regionID, \ - rhs.getMesh()->getRegion("RGN_ALL")}; \ +#define FIELD3D_BOUTREAL_FIELD3D_OP(OP_SYM, OP_TYPE) \ + template \ + std::enable_if_t && is_expr_field3d_v, \ + BinaryExpr, R, bout::op::OP_TYPE>> \ + operator OP_SYM(const L & lhs, const R & rhs) { \ + auto regionID = rhs.getRegionID(); \ + return BinaryExpr, R, bout::op::OP_TYPE>{ \ + static_cast::View>(lhs), \ + static_cast(rhs), \ + bout::op::OP_TYPE{}, \ + rhs.getMesh(), \ + rhs.getLocation(), \ + rhs.getDirections(), \ + regionID, \ + rhs.getMesh()->getRegion("RGN_ALL")}; \ } FIELD3D_BOUTREAL_FIELD3D_OP(+, Add) @@ -771,8 +772,9 @@ struct is_expr_field3d : std::true_type {}; template <> struct is_expr_field2d : std::true_type {}; -template -struct is_expr_field3d> - : std::integral_constant>::value || is_expr_field3d_v>> {}; +template +struct is_expr_field3d> + : std::integral_constant>::value + || is_expr_field3d_v>> {}; #endif /* BOUT_FIELD3D_H */ diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 9346b55e48..48d104e3ea 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -24,6 +24,12 @@ inline constexpr bool is_expr_field2d_v = is_expr_field2d>::valu template struct is_expr_field3d : std::false_type {}; +template +struct is_expr_fieldperp : 
std::false_type {}; + +template +inline constexpr bool is_expr_fieldperp_v = is_expr_fieldperp>::value; + // Helper variable template template inline constexpr bool is_expr_field3d_v = is_expr_field3d>::value; @@ -124,7 +130,7 @@ __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Ex inline std::unordered_map> regionIndicesCache; -template +template struct BinaryExpr { typename L::View lhs; typename R::View rhs; @@ -185,6 +191,7 @@ struct BinaryExpr { } inline int regionIdx(int idx) const { return indices[idx]; } + //operator ResT() { return ResT{*this}; } struct View { typename L::View lhs; typename R::View rhs; @@ -204,8 +211,8 @@ struct BinaryExpr { return indices[idx]; } __host__ __device__ __forceinline__ BoutReal operator()(int idx) const { - //return f((idx * mul) / div, lhs, rhs); // single‐pass fusion - return f(lhs((idx * mul) / div), rhs((idx * mul) / div)); // single‐pass fusion + return f((idx * mul) / div, lhs, rhs); // single‐pass fusion + //return f(lhs((idx * mul) / div), rhs((idx * mul) / div)); // single‐pass fusion } }; diff --git a/include/bout/fieldperp.hxx b/include/bout/fieldperp.hxx index 6995308dbe..49d4fec1b7 100644 --- a/include/bout/fieldperp.hxx +++ b/include/bout/fieldperp.hxx @@ -86,6 +86,17 @@ public: DirectionTypes directions_in = {YDirectionType::Standard, ZDirectionType::Standard}); + template < + typename ResT, typename L, typename R, typename Func, + typename = std::enable_if_t<(is_expr_fieldperp_v && is_expr_fieldperp_v)>> + FieldPerp(const BinaryExpr& expr) { + std::cout << "RUNNING FieldPerp constructor with CUDA\n"; + Array data{expr.size()}; + expr.evaluate(&data[0]); + *this = std::move(FieldPerp{std::move(data), expr.getMesh(), expr.getLocation(), + /* yindex */ -1, expr.getDirections()}); + } + ~FieldPerp() override = default; /*! 
@@ -292,6 +303,26 @@ public: int size() const override { return nx * nz; }; + struct View { + BoutReal* data; + int mul = 1; + int div = 1; + __host__ __device__ inline BoutReal operator()(int idx) const { + return data[(idx * mul) / div]; + } + __host__ __device__ inline BoutReal& operator[](int idx) const { + return data[(idx * mul) / div]; + } + + View& setScale(int mul, int div) { + this->mul = mul; + this->div = div; + return *this; + } + }; + operator View() { return View{&data[0]}; } + operator View() const { return View{const_cast(&data[0])}; } + private: /// The Y index at which this FieldPerp is defined int yindex{-1}; @@ -379,4 +410,12 @@ bool operator==(const FieldPerp& a, const FieldPerp& b); /// Output a string describing a FieldPerp to a stream std::ostream& operator<<(std::ostream& out, const FieldPerp& value); +template <> +struct is_expr_fieldperp : std::true_type {}; + +template +struct is_expr_fieldperp> + : std::integral_constant> + && is_expr_fieldperp_v>> {}; + #endif diff --git a/src/field/field2d.cxx b/src/field/field2d.cxx index e5c1d466b7..c8b9ebb689 100644 --- a/src/field/field2d.cxx +++ b/src/field/field2d.cxx @@ -389,8 +389,7 @@ bool operator==(const Field2D& a, const Field2D& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - Field2D diff = a - b; - return min(abs(diff)) < 1e-10; + return min(abs(a - b)) < 1e-10; } std::ostream& operator<<(std::ostream& out, const Field2D& value) { diff --git a/src/field/fieldperp.cxx b/src/field/fieldperp.cxx index ca9bdc0397..9578aa0d9d 100644 --- a/src/field/fieldperp.cxx +++ b/src/field/fieldperp.cxx @@ -209,7 +209,7 @@ bool operator==(const FieldPerp& a, const FieldPerp& b) { if (!a.isAllocated() || !b.isAllocated()) { return false; } - return (a.getIndex() == b.getIndex()) and (min(abs(a - b)) < 1e-10); + return (a.getIndex() == b.getIndex()) and (min(FieldPerp{abs(a - b)}) < 1e-10); } std::ostream& operator<<(std::ostream& out, const FieldPerp& value) { diff --git 
a/src/mesh/coordinates.cxx b/src/mesh/coordinates.cxx index b8fd33c019..0139d21fd7 100644 --- a/src/mesh/coordinates.cxx +++ b/src/mesh/coordinates.cxx @@ -1349,7 +1349,7 @@ int Coordinates::jacobian() { // Check that g is positive bout::checkPositive(g, "The determinant of g^ij", "RGN_NOBNDRY"); - J = 1. / sqrt(Field2D{g}); + J = 1. / sqrt(g); // More robust to extrapolate derived quantities directly, rather than // deriving from extrapolated covariant metric components J = interpolateAndExtrapolate(J, location, extrapolate_x, extrapolate_y, false, diff --git a/src/physics/snb.cxx b/src/physics/snb.cxx index f21bfb7ee0..80da9e1bf8 100644 --- a/src/physics/snb.cxx +++ b/src/physics/snb.cxx @@ -11,10 +11,10 @@ namespace bout { Field3D HeatFluxSNB::divHeatFlux(const Field3D& Te, const Field3D& Ne, Field3D* Div_Q_SH_out) { - Field3D thermal_speed = sqrt(Field3D{2. * SI::qe * Te / SI::Me}); + Field3D thermal_speed = sqrt(2. * SI::qe * Te / SI::Me); BoutReal Y = SQ(SQ(SI::qe) / (SI::e0 * SI::Me)) / (4 * PI); - Field3D coulomb_log = 6.6 - 0.5 * log(Field3D{Ne * 1e-20}) + 1.5 * log(Te); + Field3D coulomb_log = 6.6 - 0.5 * log(Ne * 1e-20) + 1.5 * log(Te); // Thermal electron-electron mean free path [m] Field3D lambda_ee_T = pow(thermal_speed, 4) / (Y * Ne * coulomb_log); From b75103dcaf716636987e711dd78defa78a544ddb Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 03:43:11 -0700 Subject: [PATCH 20/29] Add FFT GPU shiftZ --- include/bout/twiddle.hxx | 2046 +++++++++++++++++++++++++++ src/mesh/parallel/shiftedmetric.cxx | 325 ++++- 2 files changed, 2367 insertions(+), 4 deletions(-) create mode 100644 include/bout/twiddle.hxx diff --git a/include/bout/twiddle.hxx b/include/bout/twiddle.hxx new file mode 100644 index 0000000000..ae4f729b48 --- /dev/null +++ b/include/bout/twiddle.hxx @@ -0,0 +1,2046 @@ +__constant__ double2 c_twiddle_fwd_16[16] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9238795325112867, -0.3826834323650898}, // k=1 
+ {0.7071067811865476, -0.7071067811865475}, // k=2 + {0.3826834323650898, -0.9238795325112867}, // k=3 + {0.0000000000000001, -1.0000000000000000}, // k=4 + {-0.3826834323650897, -0.9238795325112867}, // k=5 + {-0.7071067811865475, -0.7071067811865476}, // k=6 + {-0.9238795325112867, -0.3826834323650899}, // k=7 + {-1.0000000000000000, -0.0000000000000001}, // k=8 + {-0.9238795325112868, 0.3826834323650897}, // k=9 + {-0.7071067811865477, 0.7071067811865475}, // k=10 + {-0.3826834323650903, 0.9238795325112865}, // k=11 + {-0.0000000000000002, 1.0000000000000000}, // k=12 + {0.3826834323650900, 0.9238795325112866}, // k=13 + {0.7071067811865474, 0.7071067811865477}, // k=14 + {0.9238795325112865, 0.3826834323650904}, // k=15 +}; + +__constant__ double2 c_twiddle_inv_16[16] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9238795325112867, 0.3826834323650898}, // k=1 + {0.7071067811865476, 0.7071067811865475}, // k=2 + {0.3826834323650898, 0.9238795325112867}, // k=3 + {0.0000000000000001, 1.0000000000000000}, // k=4 + {-0.3826834323650897, 0.9238795325112867}, // k=5 + {-0.7071067811865475, 0.7071067811865476}, // k=6 + {-0.9238795325112867, 0.3826834323650899}, // k=7 + {-1.0000000000000000, 0.0000000000000001}, // k=8 + {-0.9238795325112868, -0.3826834323650897}, // k=9 + {-0.7071067811865477, -0.7071067811865475}, // k=10 + {-0.3826834323650903, -0.9238795325112865}, // k=11 + {-0.0000000000000002, -1.0000000000000000}, // k=12 + {0.3826834323650900, -0.9238795325112866}, // k=13 + {0.7071067811865474, -0.7071067811865477}, // k=14 + {0.9238795325112865, -0.3826834323650904}, // k=15 +}; +__constant__ double2 c_twiddle_fwd_32[32] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9807852804032304, -0.1950903220161282}, // k=1 + {0.9238795325112867, -0.3826834323650898}, // k=2 + {0.8314696123025452, -0.5555702330196022}, // k=3 + {0.7071067811865476, -0.7071067811865475}, // k=4 + {0.5555702330196023, -0.8314696123025452}, // k=5 + 
{0.3826834323650898, -0.9238795325112867}, // k=6 + {0.1950903220161283, -0.9807852804032304}, // k=7 + {0.0000000000000001, -1.0000000000000000}, // k=8 + {-0.1950903220161282, -0.9807852804032304}, // k=9 + {-0.3826834323650897, -0.9238795325112867}, // k=10 + {-0.5555702330196020, -0.8314696123025455}, // k=11 + {-0.7071067811865475, -0.7071067811865476}, // k=12 + {-0.8314696123025453, -0.5555702330196022}, // k=13 + {-0.9238795325112867, -0.3826834323650899}, // k=14 + {-0.9807852804032304, -0.1950903220161286}, // k=15 + {-1.0000000000000000, -0.0000000000000001}, // k=16 + {-0.9807852804032304, 0.1950903220161284}, // k=17 + {-0.9238795325112868, 0.3826834323650897}, // k=18 + {-0.8314696123025455, 0.5555702330196020}, // k=19 + {-0.7071067811865477, 0.7071067811865475}, // k=20 + {-0.5555702330196022, 0.8314696123025452}, // k=21 + {-0.3826834323650903, 0.9238795325112865}, // k=22 + {-0.1950903220161287, 0.9807852804032303}, // k=23 + {-0.0000000000000002, 1.0000000000000000}, // k=24 + {0.1950903220161283, 0.9807852804032304}, // k=25 + {0.3826834323650900, 0.9238795325112866}, // k=26 + {0.5555702330196018, 0.8314696123025455}, // k=27 + {0.7071067811865474, 0.7071067811865477}, // k=28 + {0.8314696123025452, 0.5555702330196022}, // k=29 + {0.9238795325112865, 0.3826834323650904}, // k=30 + {0.9807852804032303, 0.1950903220161287}, // k=31 +}; + +__constant__ double2 c_twiddle_inv_32[32] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9807852804032304, 0.1950903220161282}, // k=1 + {0.9238795325112867, 0.3826834323650898}, // k=2 + {0.8314696123025452, 0.5555702330196022}, // k=3 + {0.7071067811865476, 0.7071067811865475}, // k=4 + {0.5555702330196023, 0.8314696123025452}, // k=5 + {0.3826834323650898, 0.9238795325112867}, // k=6 + {0.1950903220161283, 0.9807852804032304}, // k=7 + {0.0000000000000001, 1.0000000000000000}, // k=8 + {-0.1950903220161282, 0.9807852804032304}, // k=9 + {-0.3826834323650897, 0.9238795325112867}, // k=10 + 
{-0.5555702330196020, 0.8314696123025455}, // k=11 + {-0.7071067811865475, 0.7071067811865476}, // k=12 + {-0.8314696123025453, 0.5555702330196022}, // k=13 + {-0.9238795325112867, 0.3826834323650899}, // k=14 + {-0.9807852804032304, 0.1950903220161286}, // k=15 + {-1.0000000000000000, 0.0000000000000001}, // k=16 + {-0.9807852804032304, -0.1950903220161284}, // k=17 + {-0.9238795325112868, -0.3826834323650897}, // k=18 + {-0.8314696123025455, -0.5555702330196020}, // k=19 + {-0.7071067811865477, -0.7071067811865475}, // k=20 + {-0.5555702330196022, -0.8314696123025452}, // k=21 + {-0.3826834323650903, -0.9238795325112865}, // k=22 + {-0.1950903220161287, -0.9807852804032303}, // k=23 + {-0.0000000000000002, -1.0000000000000000}, // k=24 + {0.1950903220161283, -0.9807852804032304}, // k=25 + {0.3826834323650900, -0.9238795325112866}, // k=26 + {0.5555702330196018, -0.8314696123025455}, // k=27 + {0.7071067811865474, -0.7071067811865477}, // k=28 + {0.8314696123025452, -0.5555702330196022}, // k=29 + {0.9238795325112865, -0.3826834323650904}, // k=30 + {0.9807852804032303, -0.1950903220161287}, // k=31 +}; +__constant__ double2 c_twiddle_fwd_64[64] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9951847266721969, -0.0980171403295606}, // k=1 + {0.9807852804032304, -0.1950903220161282}, // k=2 + {0.9569403357322088, -0.2902846772544623}, // k=3 + {0.9238795325112867, -0.3826834323650898}, // k=4 + {0.8819212643483550, -0.4713967368259976}, // k=5 + {0.8314696123025452, -0.5555702330196022}, // k=6 + {0.7730104533627370, -0.6343932841636455}, // k=7 + {0.7071067811865476, -0.7071067811865475}, // k=8 + {0.6343932841636455, -0.7730104533627370}, // k=9 + {0.5555702330196023, -0.8314696123025452}, // k=10 + {0.4713967368259978, -0.8819212643483549}, // k=11 + {0.3826834323650898, -0.9238795325112867}, // k=12 + {0.2902846772544623, -0.9569403357322089}, // k=13 + {0.1950903220161283, -0.9807852804032304}, // k=14 + {0.0980171403295608, -0.9951847266721968}, 
// k=15 + {0.0000000000000001, -1.0000000000000000}, // k=16 + {-0.0980171403295606, -0.9951847266721969}, // k=17 + {-0.1950903220161282, -0.9807852804032304}, // k=18 + {-0.2902846772544622, -0.9569403357322089}, // k=19 + {-0.3826834323650897, -0.9238795325112867}, // k=20 + {-0.4713967368259977, -0.8819212643483550}, // k=21 + {-0.5555702330196020, -0.8314696123025455}, // k=22 + {-0.6343932841636454, -0.7730104533627371}, // k=23 + {-0.7071067811865475, -0.7071067811865476}, // k=24 + {-0.7730104533627370, -0.6343932841636455}, // k=25 + {-0.8314696123025453, -0.5555702330196022}, // k=26 + {-0.8819212643483549, -0.4713967368259979}, // k=27 + {-0.9238795325112867, -0.3826834323650899}, // k=28 + {-0.9569403357322088, -0.2902846772544624}, // k=29 + {-0.9807852804032304, -0.1950903220161286}, // k=30 + {-0.9951847266721968, -0.0980171403295608}, // k=31 + {-1.0000000000000000, -0.0000000000000001}, // k=32 + {-0.9951847266721969, 0.0980171403295606}, // k=33 + {-0.9807852804032304, 0.1950903220161284}, // k=34 + {-0.9569403357322089, 0.2902846772544621}, // k=35 + {-0.9238795325112868, 0.3826834323650897}, // k=36 + {-0.8819212643483550, 0.4713967368259976}, // k=37 + {-0.8314696123025455, 0.5555702330196020}, // k=38 + {-0.7730104533627371, 0.6343932841636453}, // k=39 + {-0.7071067811865477, 0.7071067811865475}, // k=40 + {-0.6343932841636459, 0.7730104533627367}, // k=41 + {-0.5555702330196022, 0.8314696123025452}, // k=42 + {-0.4713967368259979, 0.8819212643483549}, // k=43 + {-0.3826834323650903, 0.9238795325112865}, // k=44 + {-0.2902846772544624, 0.9569403357322088}, // k=45 + {-0.1950903220161287, 0.9807852804032303}, // k=46 + {-0.0980171403295605, 0.9951847266721969}, // k=47 + {-0.0000000000000002, 1.0000000000000000}, // k=48 + {0.0980171403295601, 0.9951847266721969}, // k=49 + {0.1950903220161283, 0.9807852804032304}, // k=50 + {0.2902846772544621, 0.9569403357322089}, // k=51 + {0.3826834323650900, 0.9238795325112866}, // k=52 + 
{0.4713967368259976, 0.8819212643483550}, // k=53 + {0.5555702330196018, 0.8314696123025455}, // k=54 + {0.6343932841636456, 0.7730104533627369}, // k=55 + {0.7071067811865474, 0.7071067811865477}, // k=56 + {0.7730104533627367, 0.6343932841636459}, // k=57 + {0.8314696123025452, 0.5555702330196022}, // k=58 + {0.8819212643483548, 0.4713967368259979}, // k=59 + {0.9238795325112865, 0.3826834323650904}, // k=60 + {0.9569403357322088, 0.2902846772544625}, // k=61 + {0.9807852804032303, 0.1950903220161287}, // k=62 + {0.9951847266721969, 0.0980171403295605}, // k=63 +}; + +__constant__ double2 c_twiddle_inv_64[64] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9951847266721969, 0.0980171403295606}, // k=1 + {0.9807852804032304, 0.1950903220161282}, // k=2 + {0.9569403357322088, 0.2902846772544623}, // k=3 + {0.9238795325112867, 0.3826834323650898}, // k=4 + {0.8819212643483550, 0.4713967368259976}, // k=5 + {0.8314696123025452, 0.5555702330196022}, // k=6 + {0.7730104533627370, 0.6343932841636455}, // k=7 + {0.7071067811865476, 0.7071067811865475}, // k=8 + {0.6343932841636455, 0.7730104533627370}, // k=9 + {0.5555702330196023, 0.8314696123025452}, // k=10 + {0.4713967368259978, 0.8819212643483549}, // k=11 + {0.3826834323650898, 0.9238795325112867}, // k=12 + {0.2902846772544623, 0.9569403357322089}, // k=13 + {0.1950903220161283, 0.9807852804032304}, // k=14 + {0.0980171403295608, 0.9951847266721968}, // k=15 + {0.0000000000000001, 1.0000000000000000}, // k=16 + {-0.0980171403295606, 0.9951847266721969}, // k=17 + {-0.1950903220161282, 0.9807852804032304}, // k=18 + {-0.2902846772544622, 0.9569403357322089}, // k=19 + {-0.3826834323650897, 0.9238795325112867}, // k=20 + {-0.4713967368259977, 0.8819212643483550}, // k=21 + {-0.5555702330196020, 0.8314696123025455}, // k=22 + {-0.6343932841636454, 0.7730104533627371}, // k=23 + {-0.7071067811865475, 0.7071067811865476}, // k=24 + {-0.7730104533627370, 0.6343932841636455}, // k=25 + {-0.8314696123025453, 
0.5555702330196022}, // k=26 + {-0.8819212643483549, 0.4713967368259979}, // k=27 + {-0.9238795325112867, 0.3826834323650899}, // k=28 + {-0.9569403357322088, 0.2902846772544624}, // k=29 + {-0.9807852804032304, 0.1950903220161286}, // k=30 + {-0.9951847266721968, 0.0980171403295608}, // k=31 + {-1.0000000000000000, 0.0000000000000001}, // k=32 + {-0.9951847266721969, -0.0980171403295606}, // k=33 + {-0.9807852804032304, -0.1950903220161284}, // k=34 + {-0.9569403357322089, -0.2902846772544621}, // k=35 + {-0.9238795325112868, -0.3826834323650897}, // k=36 + {-0.8819212643483550, -0.4713967368259976}, // k=37 + {-0.8314696123025455, -0.5555702330196020}, // k=38 + {-0.7730104533627371, -0.6343932841636453}, // k=39 + {-0.7071067811865477, -0.7071067811865475}, // k=40 + {-0.6343932841636459, -0.7730104533627367}, // k=41 + {-0.5555702330196022, -0.8314696123025452}, // k=42 + {-0.4713967368259979, -0.8819212643483549}, // k=43 + {-0.3826834323650903, -0.9238795325112865}, // k=44 + {-0.2902846772544624, -0.9569403357322088}, // k=45 + {-0.1950903220161287, -0.9807852804032303}, // k=46 + {-0.0980171403295605, -0.9951847266721969}, // k=47 + {-0.0000000000000002, -1.0000000000000000}, // k=48 + {0.0980171403295601, -0.9951847266721969}, // k=49 + {0.1950903220161283, -0.9807852804032304}, // k=50 + {0.2902846772544621, -0.9569403357322089}, // k=51 + {0.3826834323650900, -0.9238795325112866}, // k=52 + {0.4713967368259976, -0.8819212643483550}, // k=53 + {0.5555702330196018, -0.8314696123025455}, // k=54 + {0.6343932841636456, -0.7730104533627369}, // k=55 + {0.7071067811865474, -0.7071067811865477}, // k=56 + {0.7730104533627367, -0.6343932841636459}, // k=57 + {0.8314696123025452, -0.5555702330196022}, // k=58 + {0.8819212643483548, -0.4713967368259979}, // k=59 + {0.9238795325112865, -0.3826834323650904}, // k=60 + {0.9569403357322088, -0.2902846772544625}, // k=61 + {0.9807852804032303, -0.1950903220161287}, // k=62 + {0.9951847266721969, -0.0980171403295605}, 
// k=63 +}; +__constant__ double2 c_twiddle_fwd_128[128] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9987954562051724, -0.0490676743274180}, // k=1 + {0.9951847266721969, -0.0980171403295606}, // k=2 + {0.9891765099647810, -0.1467304744553617}, // k=3 + {0.9807852804032304, -0.1950903220161282}, // k=4 + {0.9700312531945440, -0.2429801799032639}, // k=5 + {0.9569403357322088, -0.2902846772544623}, // k=6 + {0.9415440651830208, -0.3368898533922201}, // k=7 + {0.9238795325112867, -0.3826834323650898}, // k=8 + {0.9039892931234433, -0.4275550934302821}, // k=9 + {0.8819212643483550, -0.4713967368259976}, // k=10 + {0.8577286100002721, -0.5141027441932217}, // k=11 + {0.8314696123025452, -0.5555702330196022}, // k=12 + {0.8032075314806449, -0.5956993044924334}, // k=13 + {0.7730104533627370, -0.6343932841636455}, // k=14 + {0.7409511253549591, -0.6715589548470183}, // k=15 + {0.7071067811865476, -0.7071067811865475}, // k=16 + {0.6715589548470183, -0.7409511253549591}, // k=17 + {0.6343932841636455, -0.7730104533627370}, // k=18 + {0.5956993044924335, -0.8032075314806448}, // k=19 + {0.5555702330196023, -0.8314696123025452}, // k=20 + {0.5141027441932217, -0.8577286100002721}, // k=21 + {0.4713967368259978, -0.8819212643483549}, // k=22 + {0.4275550934302822, -0.9039892931234433}, // k=23 + {0.3826834323650898, -0.9238795325112867}, // k=24 + {0.3368898533922201, -0.9415440651830208}, // k=25 + {0.2902846772544623, -0.9569403357322089}, // k=26 + {0.2429801799032640, -0.9700312531945440}, // k=27 + {0.1950903220161283, -0.9807852804032304}, // k=28 + {0.1467304744553617, -0.9891765099647810}, // k=29 + {0.0980171403295608, -0.9951847266721968}, // k=30 + {0.0490676743274181, -0.9987954562051724}, // k=31 + {0.0000000000000001, -1.0000000000000000}, // k=32 + {-0.0490676743274180, -0.9987954562051724}, // k=33 + {-0.0980171403295606, -0.9951847266721969}, // k=34 + {-0.1467304744553616, -0.9891765099647810}, // k=35 + {-0.1950903220161282, 
-0.9807852804032304}, // k=36 + {-0.2429801799032639, -0.9700312531945440}, // k=37 + {-0.2902846772544622, -0.9569403357322089}, // k=38 + {-0.3368898533922199, -0.9415440651830208}, // k=39 + {-0.3826834323650897, -0.9238795325112867}, // k=40 + {-0.4275550934302819, -0.9039892931234434}, // k=41 + {-0.4713967368259977, -0.8819212643483550}, // k=42 + {-0.5141027441932217, -0.8577286100002721}, // k=43 + {-0.5555702330196020, -0.8314696123025455}, // k=44 + {-0.5956993044924334, -0.8032075314806449}, // k=45 + {-0.6343932841636454, -0.7730104533627371}, // k=46 + {-0.6715589548470184, -0.7409511253549590}, // k=47 + {-0.7071067811865475, -0.7071067811865476}, // k=48 + {-0.7409511253549589, -0.6715589548470186}, // k=49 + {-0.7730104533627370, -0.6343932841636455}, // k=50 + {-0.8032075314806448, -0.5956993044924335}, // k=51 + {-0.8314696123025453, -0.5555702330196022}, // k=52 + {-0.8577286100002720, -0.5141027441932218}, // k=53 + {-0.8819212643483549, -0.4713967368259979}, // k=54 + {-0.9039892931234433, -0.4275550934302820}, // k=55 + {-0.9238795325112867, -0.3826834323650899}, // k=56 + {-0.9415440651830207, -0.3368898533922203}, // k=57 + {-0.9569403357322088, -0.2902846772544624}, // k=58 + {-0.9700312531945440, -0.2429801799032641}, // k=59 + {-0.9807852804032304, -0.1950903220161286}, // k=60 + {-0.9891765099647810, -0.1467304744553618}, // k=61 + {-0.9951847266721968, -0.0980171403295608}, // k=62 + {-0.9987954562051724, -0.0490676743274180}, // k=63 + {-1.0000000000000000, -0.0000000000000001}, // k=64 + {-0.9987954562051724, 0.0490676743274177}, // k=65 + {-0.9951847266721969, 0.0980171403295606}, // k=66 + {-0.9891765099647810, 0.1467304744553616}, // k=67 + {-0.9807852804032304, 0.1950903220161284}, // k=68 + {-0.9700312531945440, 0.2429801799032638}, // k=69 + {-0.9569403357322089, 0.2902846772544621}, // k=70 + {-0.9415440651830208, 0.3368898533922201}, // k=71 + {-0.9238795325112868, 0.3826834323650897}, // k=72 + {-0.9039892931234434, 
0.4275550934302818}, // k=73 + {-0.8819212643483550, 0.4713967368259976}, // k=74 + {-0.8577286100002721, 0.5141027441932216}, // k=75 + {-0.8314696123025455, 0.5555702330196020}, // k=76 + {-0.8032075314806449, 0.5956993044924332}, // k=77 + {-0.7730104533627371, 0.6343932841636453}, // k=78 + {-0.7409511253549591, 0.6715589548470184}, // k=79 + {-0.7071067811865477, 0.7071067811865475}, // k=80 + {-0.6715589548470187, 0.7409511253549589}, // k=81 + {-0.6343932841636459, 0.7730104533627367}, // k=82 + {-0.5956993044924331, 0.8032075314806451}, // k=83 + {-0.5555702330196022, 0.8314696123025452}, // k=84 + {-0.5141027441932218, 0.8577286100002720}, // k=85 + {-0.4713967368259979, 0.8819212643483549}, // k=86 + {-0.4275550934302825, 0.9039892931234431}, // k=87 + {-0.3826834323650903, 0.9238795325112865}, // k=88 + {-0.3368898533922199, 0.9415440651830208}, // k=89 + {-0.2902846772544624, 0.9569403357322088}, // k=90 + {-0.2429801799032641, 0.9700312531945440}, // k=91 + {-0.1950903220161287, 0.9807852804032303}, // k=92 + {-0.1467304744553623, 0.9891765099647809}, // k=93 + {-0.0980171403295605, 0.9951847266721969}, // k=94 + {-0.0490676743274180, 0.9987954562051724}, // k=95 + {-0.0000000000000002, 1.0000000000000000}, // k=96 + {0.0490676743274177, 0.9987954562051724}, // k=97 + {0.0980171403295601, 0.9951847266721969}, // k=98 + {0.1467304744553619, 0.9891765099647809}, // k=99 + {0.1950903220161283, 0.9807852804032304}, // k=100 + {0.2429801799032638, 0.9700312531945440}, // k=101 + {0.2902846772544621, 0.9569403357322089}, // k=102 + {0.3368898533922196, 0.9415440651830209}, // k=103 + {0.3826834323650900, 0.9238795325112866}, // k=104 + {0.4275550934302821, 0.9039892931234433}, // k=105 + {0.4713967368259976, 0.8819212643483550}, // k=106 + {0.5141027441932216, 0.8577286100002722}, // k=107 + {0.5555702330196018, 0.8314696123025455}, // k=108 + {0.5956993044924329, 0.8032075314806453}, // k=109 + {0.6343932841636456, 0.7730104533627369}, // k=110 + 
{0.6715589548470183, 0.7409511253549591}, // k=111 + {0.7071067811865474, 0.7071067811865477}, // k=112 + {0.7409511253549589, 0.6715589548470187}, // k=113 + {0.7730104533627367, 0.6343932841636459}, // k=114 + {0.8032075314806451, 0.5956993044924332}, // k=115 + {0.8314696123025452, 0.5555702330196022}, // k=116 + {0.8577286100002720, 0.5141027441932219}, // k=117 + {0.8819212643483548, 0.4713967368259979}, // k=118 + {0.9039892931234431, 0.4275550934302825}, // k=119 + {0.9238795325112865, 0.3826834323650904}, // k=120 + {0.9415440651830208, 0.3368898533922200}, // k=121 + {0.9569403357322088, 0.2902846772544625}, // k=122 + {0.9700312531945440, 0.2429801799032642}, // k=123 + {0.9807852804032303, 0.1950903220161287}, // k=124 + {0.9891765099647809, 0.1467304744553624}, // k=125 + {0.9951847266721969, 0.0980171403295605}, // k=126 + {0.9987954562051724, 0.0490676743274181}, // k=127 +}; + +__constant__ double2 c_twiddle_inv_128[128] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9987954562051724, 0.0490676743274180}, // k=1 + {0.9951847266721969, 0.0980171403295606}, // k=2 + {0.9891765099647810, 0.1467304744553617}, // k=3 + {0.9807852804032304, 0.1950903220161282}, // k=4 + {0.9700312531945440, 0.2429801799032639}, // k=5 + {0.9569403357322088, 0.2902846772544623}, // k=6 + {0.9415440651830208, 0.3368898533922201}, // k=7 + {0.9238795325112867, 0.3826834323650898}, // k=8 + {0.9039892931234433, 0.4275550934302821}, // k=9 + {0.8819212643483550, 0.4713967368259976}, // k=10 + {0.8577286100002721, 0.5141027441932217}, // k=11 + {0.8314696123025452, 0.5555702330196022}, // k=12 + {0.8032075314806449, 0.5956993044924334}, // k=13 + {0.7730104533627370, 0.6343932841636455}, // k=14 + {0.7409511253549591, 0.6715589548470183}, // k=15 + {0.7071067811865476, 0.7071067811865475}, // k=16 + {0.6715589548470183, 0.7409511253549591}, // k=17 + {0.6343932841636455, 0.7730104533627370}, // k=18 + {0.5956993044924335, 0.8032075314806448}, // k=19 + 
{0.5555702330196023, 0.8314696123025452}, // k=20 + {0.5141027441932217, 0.8577286100002721}, // k=21 + {0.4713967368259978, 0.8819212643483549}, // k=22 + {0.4275550934302822, 0.9039892931234433}, // k=23 + {0.3826834323650898, 0.9238795325112867}, // k=24 + {0.3368898533922201, 0.9415440651830208}, // k=25 + {0.2902846772544623, 0.9569403357322089}, // k=26 + {0.2429801799032640, 0.9700312531945440}, // k=27 + {0.1950903220161283, 0.9807852804032304}, // k=28 + {0.1467304744553617, 0.9891765099647810}, // k=29 + {0.0980171403295608, 0.9951847266721968}, // k=30 + {0.0490676743274181, 0.9987954562051724}, // k=31 + {0.0000000000000001, 1.0000000000000000}, // k=32 + {-0.0490676743274180, 0.9987954562051724}, // k=33 + {-0.0980171403295606, 0.9951847266721969}, // k=34 + {-0.1467304744553616, 0.9891765099647810}, // k=35 + {-0.1950903220161282, 0.9807852804032304}, // k=36 + {-0.2429801799032639, 0.9700312531945440}, // k=37 + {-0.2902846772544622, 0.9569403357322089}, // k=38 + {-0.3368898533922199, 0.9415440651830208}, // k=39 + {-0.3826834323650897, 0.9238795325112867}, // k=40 + {-0.4275550934302819, 0.9039892931234434}, // k=41 + {-0.4713967368259977, 0.8819212643483550}, // k=42 + {-0.5141027441932217, 0.8577286100002721}, // k=43 + {-0.5555702330196020, 0.8314696123025455}, // k=44 + {-0.5956993044924334, 0.8032075314806449}, // k=45 + {-0.6343932841636454, 0.7730104533627371}, // k=46 + {-0.6715589548470184, 0.7409511253549590}, // k=47 + {-0.7071067811865475, 0.7071067811865476}, // k=48 + {-0.7409511253549589, 0.6715589548470186}, // k=49 + {-0.7730104533627370, 0.6343932841636455}, // k=50 + {-0.8032075314806448, 0.5956993044924335}, // k=51 + {-0.8314696123025453, 0.5555702330196022}, // k=52 + {-0.8577286100002720, 0.5141027441932218}, // k=53 + {-0.8819212643483549, 0.4713967368259979}, // k=54 + {-0.9039892931234433, 0.4275550934302820}, // k=55 + {-0.9238795325112867, 0.3826834323650899}, // k=56 + {-0.9415440651830207, 0.3368898533922203}, // k=57 
+ {-0.9569403357322088, 0.2902846772544624}, // k=58 + {-0.9700312531945440, 0.2429801799032641}, // k=59 + {-0.9807852804032304, 0.1950903220161286}, // k=60 + {-0.9891765099647810, 0.1467304744553618}, // k=61 + {-0.9951847266721968, 0.0980171403295608}, // k=62 + {-0.9987954562051724, 0.0490676743274180}, // k=63 + {-1.0000000000000000, 0.0000000000000001}, // k=64 + {-0.9987954562051724, -0.0490676743274177}, // k=65 + {-0.9951847266721969, -0.0980171403295606}, // k=66 + {-0.9891765099647810, -0.1467304744553616}, // k=67 + {-0.9807852804032304, -0.1950903220161284}, // k=68 + {-0.9700312531945440, -0.2429801799032638}, // k=69 + {-0.9569403357322089, -0.2902846772544621}, // k=70 + {-0.9415440651830208, -0.3368898533922201}, // k=71 + {-0.9238795325112868, -0.3826834323650897}, // k=72 + {-0.9039892931234434, -0.4275550934302818}, // k=73 + {-0.8819212643483550, -0.4713967368259976}, // k=74 + {-0.8577286100002721, -0.5141027441932216}, // k=75 + {-0.8314696123025455, -0.5555702330196020}, // k=76 + {-0.8032075314806449, -0.5956993044924332}, // k=77 + {-0.7730104533627371, -0.6343932841636453}, // k=78 + {-0.7409511253549591, -0.6715589548470184}, // k=79 + {-0.7071067811865477, -0.7071067811865475}, // k=80 + {-0.6715589548470187, -0.7409511253549589}, // k=81 + {-0.6343932841636459, -0.7730104533627367}, // k=82 + {-0.5956993044924331, -0.8032075314806451}, // k=83 + {-0.5555702330196022, -0.8314696123025452}, // k=84 + {-0.5141027441932218, -0.8577286100002720}, // k=85 + {-0.4713967368259979, -0.8819212643483549}, // k=86 + {-0.4275550934302825, -0.9039892931234431}, // k=87 + {-0.3826834323650903, -0.9238795325112865}, // k=88 + {-0.3368898533922199, -0.9415440651830208}, // k=89 + {-0.2902846772544624, -0.9569403357322088}, // k=90 + {-0.2429801799032641, -0.9700312531945440}, // k=91 + {-0.1950903220161287, -0.9807852804032303}, // k=92 + {-0.1467304744553623, -0.9891765099647809}, // k=93 + {-0.0980171403295605, -0.9951847266721969}, // k=94 + 
{-0.0490676743274180, -0.9987954562051724}, // k=95 + {-0.0000000000000002, -1.0000000000000000}, // k=96 + {0.0490676743274177, -0.9987954562051724}, // k=97 + {0.0980171403295601, -0.9951847266721969}, // k=98 + {0.1467304744553619, -0.9891765099647809}, // k=99 + {0.1950903220161283, -0.9807852804032304}, // k=100 + {0.2429801799032638, -0.9700312531945440}, // k=101 + {0.2902846772544621, -0.9569403357322089}, // k=102 + {0.3368898533922196, -0.9415440651830209}, // k=103 + {0.3826834323650900, -0.9238795325112866}, // k=104 + {0.4275550934302821, -0.9039892931234433}, // k=105 + {0.4713967368259976, -0.8819212643483550}, // k=106 + {0.5141027441932216, -0.8577286100002722}, // k=107 + {0.5555702330196018, -0.8314696123025455}, // k=108 + {0.5956993044924329, -0.8032075314806453}, // k=109 + {0.6343932841636456, -0.7730104533627369}, // k=110 + {0.6715589548470183, -0.7409511253549591}, // k=111 + {0.7071067811865474, -0.7071067811865477}, // k=112 + {0.7409511253549589, -0.6715589548470187}, // k=113 + {0.7730104533627367, -0.6343932841636459}, // k=114 + {0.8032075314806451, -0.5956993044924332}, // k=115 + {0.8314696123025452, -0.5555702330196022}, // k=116 + {0.8577286100002720, -0.5141027441932219}, // k=117 + {0.8819212643483548, -0.4713967368259979}, // k=118 + {0.9039892931234431, -0.4275550934302825}, // k=119 + {0.9238795325112865, -0.3826834323650904}, // k=120 + {0.9415440651830208, -0.3368898533922200}, // k=121 + {0.9569403357322088, -0.2902846772544625}, // k=122 + {0.9700312531945440, -0.2429801799032642}, // k=123 + {0.9807852804032303, -0.1950903220161287}, // k=124 + {0.9891765099647809, -0.1467304744553624}, // k=125 + {0.9951847266721969, -0.0980171403295605}, // k=126 + {0.9987954562051724, -0.0490676743274181}, // k=127 +}; +__constant__ double2 c_twiddle_fwd_256[256] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9996988186962042, -0.0245412285229123}, // k=1 + {0.9987954562051724, -0.0490676743274180}, // k=2 + 
{0.9972904566786902, -0.0735645635996674}, // k=3 + {0.9951847266721969, -0.0980171403295606}, // k=4 + {0.9924795345987100, -0.1224106751992162}, // k=5 + {0.9891765099647810, -0.1467304744553617}, // k=6 + {0.9852776423889412, -0.1709618887603012}, // k=7 + {0.9807852804032304, -0.1950903220161282}, // k=8 + {0.9757021300385286, -0.2191012401568698}, // k=9 + {0.9700312531945440, -0.2429801799032639}, // k=10 + {0.9637760657954398, -0.2667127574748984}, // k=11 + {0.9569403357322088, -0.2902846772544623}, // k=12 + {0.9495281805930367, -0.3136817403988915}, // k=13 + {0.9415440651830208, -0.3368898533922201}, // k=14 + {0.9329927988347390, -0.3598950365349881}, // k=15 + {0.9238795325112867, -0.3826834323650898}, // k=16 + {0.9142097557035307, -0.4052413140049899}, // k=17 + {0.9039892931234433, -0.4275550934302821}, // k=18 + {0.8932243011955153, -0.4496113296546065}, // k=19 + {0.8819212643483550, -0.4713967368259976}, // k=20 + {0.8700869911087115, -0.4928981922297840}, // k=21 + {0.8577286100002721, -0.5141027441932217}, // k=22 + {0.8448535652497071, -0.5349976198870972}, // k=23 + {0.8314696123025452, -0.5555702330196022}, // k=24 + {0.8175848131515837, -0.5758081914178453}, // k=25 + {0.8032075314806449, -0.5956993044924334}, // k=26 + {0.7883464276266063, -0.6152315905806268}, // k=27 + {0.7730104533627370, -0.6343932841636455}, // k=28 + {0.7572088465064846, -0.6531728429537768}, // k=29 + {0.7409511253549591, -0.6715589548470183}, // k=30 + {0.7242470829514670, -0.6895405447370668}, // k=31 + {0.7071067811865476, -0.7071067811865475}, // k=32 + {0.6895405447370669, -0.7242470829514669}, // k=33 + {0.6715589548470183, -0.7409511253549591}, // k=34 + {0.6531728429537768, -0.7572088465064845}, // k=35 + {0.6343932841636455, -0.7730104533627370}, // k=36 + {0.6152315905806268, -0.7883464276266062}, // k=37 + {0.5956993044924335, -0.8032075314806448}, // k=38 + {0.5758081914178453, -0.8175848131515837}, // k=39 + {0.5555702330196023, -0.8314696123025452}, // 
k=40 + {0.5349976198870973, -0.8448535652497070}, // k=41 + {0.5141027441932217, -0.8577286100002721}, // k=42 + {0.4928981922297841, -0.8700869911087113}, // k=43 + {0.4713967368259978, -0.8819212643483549}, // k=44 + {0.4496113296546066, -0.8932243011955153}, // k=45 + {0.4275550934302822, -0.9039892931234433}, // k=46 + {0.4052413140049899, -0.9142097557035307}, // k=47 + {0.3826834323650898, -0.9238795325112867}, // k=48 + {0.3598950365349883, -0.9329927988347388}, // k=49 + {0.3368898533922201, -0.9415440651830208}, // k=50 + {0.3136817403988916, -0.9495281805930367}, // k=51 + {0.2902846772544623, -0.9569403357322089}, // k=52 + {0.2667127574748984, -0.9637760657954398}, // k=53 + {0.2429801799032640, -0.9700312531945440}, // k=54 + {0.2191012401568698, -0.9757021300385286}, // k=55 + {0.1950903220161283, -0.9807852804032304}, // k=56 + {0.1709618887603014, -0.9852776423889412}, // k=57 + {0.1467304744553617, -0.9891765099647810}, // k=58 + {0.1224106751992163, -0.9924795345987100}, // k=59 + {0.0980171403295608, -0.9951847266721968}, // k=60 + {0.0735645635996675, -0.9972904566786902}, // k=61 + {0.0490676743274181, -0.9987954562051724}, // k=62 + {0.0245412285229123, -0.9996988186962042}, // k=63 + {0.0000000000000001, -1.0000000000000000}, // k=64 + {-0.0245412285229121, -0.9996988186962042}, // k=65 + {-0.0490676743274180, -0.9987954562051724}, // k=66 + {-0.0735645635996673, -0.9972904566786902}, // k=67 + {-0.0980171403295606, -0.9951847266721969}, // k=68 + {-0.1224106751992162, -0.9924795345987100}, // k=69 + {-0.1467304744553616, -0.9891765099647810}, // k=70 + {-0.1709618887603012, -0.9852776423889412}, // k=71 + {-0.1950903220161282, -0.9807852804032304}, // k=72 + {-0.2191012401568697, -0.9757021300385286}, // k=73 + {-0.2429801799032639, -0.9700312531945440}, // k=74 + {-0.2667127574748983, -0.9637760657954398}, // k=75 + {-0.2902846772544622, -0.9569403357322089}, // k=76 + {-0.3136817403988914, -0.9495281805930367}, // k=77 + 
{-0.3368898533922199, -0.9415440651830208}, // k=78 + {-0.3598950365349882, -0.9329927988347388}, // k=79 + {-0.3826834323650897, -0.9238795325112867}, // k=80 + {-0.4052413140049897, -0.9142097557035307}, // k=81 + {-0.4275550934302819, -0.9039892931234434}, // k=82 + {-0.4496113296546067, -0.8932243011955152}, // k=83 + {-0.4713967368259977, -0.8819212643483550}, // k=84 + {-0.4928981922297840, -0.8700869911087115}, // k=85 + {-0.5141027441932217, -0.8577286100002721}, // k=86 + {-0.5349976198870970, -0.8448535652497072}, // k=87 + {-0.5555702330196020, -0.8314696123025455}, // k=88 + {-0.5758081914178453, -0.8175848131515837}, // k=89 + {-0.5956993044924334, -0.8032075314806449}, // k=90 + {-0.6152315905806267, -0.7883464276266063}, // k=91 + {-0.6343932841636454, -0.7730104533627371}, // k=92 + {-0.6531728429537765, -0.7572088465064847}, // k=93 + {-0.6715589548470184, -0.7409511253549590}, // k=94 + {-0.6895405447370669, -0.7242470829514669}, // k=95 + {-0.7071067811865475, -0.7071067811865476}, // k=96 + {-0.7242470829514668, -0.6895405447370671}, // k=97 + {-0.7409511253549589, -0.6715589548470186}, // k=98 + {-0.7572088465064846, -0.6531728429537766}, // k=99 + {-0.7730104533627370, -0.6343932841636455}, // k=100 + {-0.7883464276266062, -0.6152315905806269}, // k=101 + {-0.8032075314806448, -0.5956993044924335}, // k=102 + {-0.8175848131515836, -0.5758081914178454}, // k=103 + {-0.8314696123025453, -0.5555702330196022}, // k=104 + {-0.8448535652497071, -0.5349976198870972}, // k=105 + {-0.8577286100002720, -0.5141027441932218}, // k=106 + {-0.8700869911087113, -0.4928981922297841}, // k=107 + {-0.8819212643483549, -0.4713967368259979}, // k=108 + {-0.8932243011955152, -0.4496113296546069}, // k=109 + {-0.9039892931234433, -0.4275550934302820}, // k=110 + {-0.9142097557035307, -0.4052413140049899}, // k=111 + {-0.9238795325112867, -0.3826834323650899}, // k=112 + {-0.9329927988347388, -0.3598950365349883}, // k=113 + {-0.9415440651830207, 
-0.3368898533922203}, // k=114 + {-0.9495281805930367, -0.3136817403988914}, // k=115 + {-0.9569403357322088, -0.2902846772544624}, // k=116 + {-0.9637760657954398, -0.2667127574748985}, // k=117 + {-0.9700312531945440, -0.2429801799032641}, // k=118 + {-0.9757021300385285, -0.2191012401568700}, // k=119 + {-0.9807852804032304, -0.1950903220161286}, // k=120 + {-0.9852776423889412, -0.1709618887603012}, // k=121 + {-0.9891765099647810, -0.1467304744553618}, // k=122 + {-0.9924795345987100, -0.1224106751992163}, // k=123 + {-0.9951847266721968, -0.0980171403295608}, // k=124 + {-0.9972904566786902, -0.0735645635996677}, // k=125 + {-0.9987954562051724, -0.0490676743274180}, // k=126 + {-0.9996988186962042, -0.0245412285229123}, // k=127 + {-1.0000000000000000, -0.0000000000000001}, // k=128 + {-0.9996988186962042, 0.0245412285229121}, // k=129 + {-0.9987954562051724, 0.0490676743274177}, // k=130 + {-0.9972904566786902, 0.0735645635996675}, // k=131 + {-0.9951847266721969, 0.0980171403295606}, // k=132 + {-0.9924795345987100, 0.1224106751992161}, // k=133 + {-0.9891765099647810, 0.1467304744553616}, // k=134 + {-0.9852776423889413, 0.1709618887603010}, // k=135 + {-0.9807852804032304, 0.1950903220161284}, // k=136 + {-0.9757021300385286, 0.2191012401568698}, // k=137 + {-0.9700312531945440, 0.2429801799032638}, // k=138 + {-0.9637760657954400, 0.2667127574748983}, // k=139 + {-0.9569403357322089, 0.2902846772544621}, // k=140 + {-0.9495281805930368, 0.3136817403988912}, // k=141 + {-0.9415440651830208, 0.3368898533922201}, // k=142 + {-0.9329927988347390, 0.3598950365349881}, // k=143 + {-0.9238795325112868, 0.3826834323650897}, // k=144 + {-0.9142097557035307, 0.4052413140049897}, // k=145 + {-0.9039892931234434, 0.4275550934302818}, // k=146 + {-0.8932243011955153, 0.4496113296546067}, // k=147 + {-0.8819212643483550, 0.4713967368259976}, // k=148 + {-0.8700869911087115, 0.4928981922297839}, // k=149 + {-0.8577286100002721, 0.5141027441932216}, // k=150 + 
{-0.8448535652497072, 0.5349976198870969}, // k=151 + {-0.8314696123025455, 0.5555702330196020}, // k=152 + {-0.8175848131515837, 0.5758081914178453}, // k=153 + {-0.8032075314806449, 0.5956993044924332}, // k=154 + {-0.7883464276266063, 0.6152315905806267}, // k=155 + {-0.7730104533627371, 0.6343932841636453}, // k=156 + {-0.7572088465064848, 0.6531728429537765}, // k=157 + {-0.7409511253549591, 0.6715589548470184}, // k=158 + {-0.7242470829514670, 0.6895405447370668}, // k=159 + {-0.7071067811865477, 0.7071067811865475}, // k=160 + {-0.6895405447370671, 0.7242470829514668}, // k=161 + {-0.6715589548470187, 0.7409511253549589}, // k=162 + {-0.6531728429537771, 0.7572088465064842}, // k=163 + {-0.6343932841636459, 0.7730104533627367}, // k=164 + {-0.6152315905806273, 0.7883464276266059}, // k=165 + {-0.5956993044924331, 0.8032075314806451}, // k=166 + {-0.5758081914178452, 0.8175848131515838}, // k=167 + {-0.5555702330196022, 0.8314696123025452}, // k=168 + {-0.5349976198870973, 0.8448535652497070}, // k=169 + {-0.5141027441932218, 0.8577286100002720}, // k=170 + {-0.4928981922297842, 0.8700869911087113}, // k=171 + {-0.4713967368259979, 0.8819212643483549}, // k=172 + {-0.4496113296546069, 0.8932243011955152}, // k=173 + {-0.4275550934302825, 0.9039892931234431}, // k=174 + {-0.4052413140049904, 0.9142097557035305}, // k=175 + {-0.3826834323650903, 0.9238795325112865}, // k=176 + {-0.3598950365349879, 0.9329927988347390}, // k=177 + {-0.3368898533922199, 0.9415440651830208}, // k=178 + {-0.3136817403988915, 0.9495281805930367}, // k=179 + {-0.2902846772544624, 0.9569403357322088}, // k=180 + {-0.2667127574748985, 0.9637760657954398}, // k=181 + {-0.2429801799032641, 0.9700312531945440}, // k=182 + {-0.2191012401568701, 0.9757021300385285}, // k=183 + {-0.1950903220161287, 0.9807852804032303}, // k=184 + {-0.1709618887603017, 0.9852776423889411}, // k=185 + {-0.1467304744553623, 0.9891765099647809}, // k=186 + {-0.1224106751992160, 0.9924795345987101}, // k=187 + 
{-0.0980171403295605, 0.9951847266721969}, // k=188 + {-0.0735645635996674, 0.9972904566786902}, // k=189 + {-0.0490676743274180, 0.9987954562051724}, // k=190 + {-0.0245412285229124, 0.9996988186962042}, // k=191 + {-0.0000000000000002, 1.0000000000000000}, // k=192 + {0.0245412285229120, 0.9996988186962042}, // k=193 + {0.0490676743274177, 0.9987954562051724}, // k=194 + {0.0735645635996670, 0.9972904566786902}, // k=195 + {0.0980171403295601, 0.9951847266721969}, // k=196 + {0.1224106751992156, 0.9924795345987101}, // k=197 + {0.1467304744553619, 0.9891765099647809}, // k=198 + {0.1709618887603013, 0.9852776423889412}, // k=199 + {0.1950903220161283, 0.9807852804032304}, // k=200 + {0.2191012401568697, 0.9757021300385286}, // k=201 + {0.2429801799032638, 0.9700312531945440}, // k=202 + {0.2667127574748982, 0.9637760657954400}, // k=203 + {0.2902846772544621, 0.9569403357322089}, // k=204 + {0.3136817403988911, 0.9495281805930368}, // k=205 + {0.3368898533922196, 0.9415440651830209}, // k=206 + {0.3598950365349876, 0.9329927988347391}, // k=207 + {0.3826834323650900, 0.9238795325112866}, // k=208 + {0.4052413140049900, 0.9142097557035306}, // k=209 + {0.4275550934302821, 0.9039892931234433}, // k=210 + {0.4496113296546066, 0.8932243011955153}, // k=211 + {0.4713967368259976, 0.8819212643483550}, // k=212 + {0.4928981922297839, 0.8700869911087115}, // k=213 + {0.5141027441932216, 0.8577286100002722}, // k=214 + {0.5349976198870969, 0.8448535652497072}, // k=215 + {0.5555702330196018, 0.8314696123025455}, // k=216 + {0.5758081914178449, 0.8175848131515840}, // k=217 + {0.5956993044924329, 0.8032075314806453}, // k=218 + {0.6152315905806270, 0.7883464276266061}, // k=219 + {0.6343932841636456, 0.7730104533627369}, // k=220 + {0.6531728429537768, 0.7572088465064846}, // k=221 + {0.6715589548470183, 0.7409511253549591}, // k=222 + {0.6895405447370668, 0.7242470829514670}, // k=223 + {0.7071067811865474, 0.7071067811865477}, // k=224 + {0.7242470829514667, 
0.6895405447370672}, // k=225 + {0.7409511253549589, 0.6715589548470187}, // k=226 + {0.7572088465064842, 0.6531728429537771}, // k=227 + {0.7730104533627367, 0.6343932841636459}, // k=228 + {0.7883464276266059, 0.6152315905806274}, // k=229 + {0.8032075314806451, 0.5956993044924332}, // k=230 + {0.8175848131515837, 0.5758081914178452}, // k=231 + {0.8314696123025452, 0.5555702330196022}, // k=232 + {0.8448535652497070, 0.5349976198870973}, // k=233 + {0.8577286100002720, 0.5141027441932219}, // k=234 + {0.8700869911087113, 0.4928981922297843}, // k=235 + {0.8819212643483548, 0.4713967368259979}, // k=236 + {0.8932243011955151, 0.4496113296546070}, // k=237 + {0.9039892931234431, 0.4275550934302825}, // k=238 + {0.9142097557035305, 0.4052413140049904}, // k=239 + {0.9238795325112865, 0.3826834323650904}, // k=240 + {0.9329927988347390, 0.3598950365349880}, // k=241 + {0.9415440651830208, 0.3368898533922200}, // k=242 + {0.9495281805930367, 0.3136817403988915}, // k=243 + {0.9569403357322088, 0.2902846772544625}, // k=244 + {0.9637760657954398, 0.2667127574748986}, // k=245 + {0.9700312531945440, 0.2429801799032642}, // k=246 + {0.9757021300385285, 0.2191012401568702}, // k=247 + {0.9807852804032303, 0.1950903220161287}, // k=248 + {0.9852776423889411, 0.1709618887603018}, // k=249 + {0.9891765099647809, 0.1467304744553624}, // k=250 + {0.9924795345987100, 0.1224106751992160}, // k=251 + {0.9951847266721969, 0.0980171403295605}, // k=252 + {0.9972904566786902, 0.0735645635996674}, // k=253 + {0.9987954562051724, 0.0490676743274181}, // k=254 + {0.9996988186962042, 0.0245412285229124}, // k=255 +}; + +__constant__ double2 c_twiddle_inv_256[256] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9996988186962042, 0.0245412285229123}, // k=1 + {0.9987954562051724, 0.0490676743274180}, // k=2 + {0.9972904566786902, 0.0735645635996674}, // k=3 + {0.9951847266721969, 0.0980171403295606}, // k=4 + {0.9924795345987100, 0.1224106751992162}, // k=5 + 
{0.9891765099647810, 0.1467304744553617}, // k=6 + {0.9852776423889412, 0.1709618887603012}, // k=7 + {0.9807852804032304, 0.1950903220161282}, // k=8 + {0.9757021300385286, 0.2191012401568698}, // k=9 + {0.9700312531945440, 0.2429801799032639}, // k=10 + {0.9637760657954398, 0.2667127574748984}, // k=11 + {0.9569403357322088, 0.2902846772544623}, // k=12 + {0.9495281805930367, 0.3136817403988915}, // k=13 + {0.9415440651830208, 0.3368898533922201}, // k=14 + {0.9329927988347390, 0.3598950365349881}, // k=15 + {0.9238795325112867, 0.3826834323650898}, // k=16 + {0.9142097557035307, 0.4052413140049899}, // k=17 + {0.9039892931234433, 0.4275550934302821}, // k=18 + {0.8932243011955153, 0.4496113296546065}, // k=19 + {0.8819212643483550, 0.4713967368259976}, // k=20 + {0.8700869911087115, 0.4928981922297840}, // k=21 + {0.8577286100002721, 0.5141027441932217}, // k=22 + {0.8448535652497071, 0.5349976198870972}, // k=23 + {0.8314696123025452, 0.5555702330196022}, // k=24 + {0.8175848131515837, 0.5758081914178453}, // k=25 + {0.8032075314806449, 0.5956993044924334}, // k=26 + {0.7883464276266063, 0.6152315905806268}, // k=27 + {0.7730104533627370, 0.6343932841636455}, // k=28 + {0.7572088465064846, 0.6531728429537768}, // k=29 + {0.7409511253549591, 0.6715589548470183}, // k=30 + {0.7242470829514670, 0.6895405447370668}, // k=31 + {0.7071067811865476, 0.7071067811865475}, // k=32 + {0.6895405447370669, 0.7242470829514669}, // k=33 + {0.6715589548470183, 0.7409511253549591}, // k=34 + {0.6531728429537768, 0.7572088465064845}, // k=35 + {0.6343932841636455, 0.7730104533627370}, // k=36 + {0.6152315905806268, 0.7883464276266062}, // k=37 + {0.5956993044924335, 0.8032075314806448}, // k=38 + {0.5758081914178453, 0.8175848131515837}, // k=39 + {0.5555702330196023, 0.8314696123025452}, // k=40 + {0.5349976198870973, 0.8448535652497070}, // k=41 + {0.5141027441932217, 0.8577286100002721}, // k=42 + {0.4928981922297841, 0.8700869911087113}, // k=43 + {0.4713967368259978, 
0.8819212643483549}, // k=44 + {0.4496113296546066, 0.8932243011955153}, // k=45 + {0.4275550934302822, 0.9039892931234433}, // k=46 + {0.4052413140049899, 0.9142097557035307}, // k=47 + {0.3826834323650898, 0.9238795325112867}, // k=48 + {0.3598950365349883, 0.9329927988347388}, // k=49 + {0.3368898533922201, 0.9415440651830208}, // k=50 + {0.3136817403988916, 0.9495281805930367}, // k=51 + {0.2902846772544623, 0.9569403357322089}, // k=52 + {0.2667127574748984, 0.9637760657954398}, // k=53 + {0.2429801799032640, 0.9700312531945440}, // k=54 + {0.2191012401568698, 0.9757021300385286}, // k=55 + {0.1950903220161283, 0.9807852804032304}, // k=56 + {0.1709618887603014, 0.9852776423889412}, // k=57 + {0.1467304744553617, 0.9891765099647810}, // k=58 + {0.1224106751992163, 0.9924795345987100}, // k=59 + {0.0980171403295608, 0.9951847266721968}, // k=60 + {0.0735645635996675, 0.9972904566786902}, // k=61 + {0.0490676743274181, 0.9987954562051724}, // k=62 + {0.0245412285229123, 0.9996988186962042}, // k=63 + {0.0000000000000001, 1.0000000000000000}, // k=64 + {-0.0245412285229121, 0.9996988186962042}, // k=65 + {-0.0490676743274180, 0.9987954562051724}, // k=66 + {-0.0735645635996673, 0.9972904566786902}, // k=67 + {-0.0980171403295606, 0.9951847266721969}, // k=68 + {-0.1224106751992162, 0.9924795345987100}, // k=69 + {-0.1467304744553616, 0.9891765099647810}, // k=70 + {-0.1709618887603012, 0.9852776423889412}, // k=71 + {-0.1950903220161282, 0.9807852804032304}, // k=72 + {-0.2191012401568697, 0.9757021300385286}, // k=73 + {-0.2429801799032639, 0.9700312531945440}, // k=74 + {-0.2667127574748983, 0.9637760657954398}, // k=75 + {-0.2902846772544622, 0.9569403357322089}, // k=76 + {-0.3136817403988914, 0.9495281805930367}, // k=77 + {-0.3368898533922199, 0.9415440651830208}, // k=78 + {-0.3598950365349882, 0.9329927988347388}, // k=79 + {-0.3826834323650897, 0.9238795325112867}, // k=80 + {-0.4052413140049897, 0.9142097557035307}, // k=81 + {-0.4275550934302819, 
0.9039892931234434}, // k=82 + {-0.4496113296546067, 0.8932243011955152}, // k=83 + {-0.4713967368259977, 0.8819212643483550}, // k=84 + {-0.4928981922297840, 0.8700869911087115}, // k=85 + {-0.5141027441932217, 0.8577286100002721}, // k=86 + {-0.5349976198870970, 0.8448535652497072}, // k=87 + {-0.5555702330196020, 0.8314696123025455}, // k=88 + {-0.5758081914178453, 0.8175848131515837}, // k=89 + {-0.5956993044924334, 0.8032075314806449}, // k=90 + {-0.6152315905806267, 0.7883464276266063}, // k=91 + {-0.6343932841636454, 0.7730104533627371}, // k=92 + {-0.6531728429537765, 0.7572088465064847}, // k=93 + {-0.6715589548470184, 0.7409511253549590}, // k=94 + {-0.6895405447370669, 0.7242470829514669}, // k=95 + {-0.7071067811865475, 0.7071067811865476}, // k=96 + {-0.7242470829514668, 0.6895405447370671}, // k=97 + {-0.7409511253549589, 0.6715589548470186}, // k=98 + {-0.7572088465064846, 0.6531728429537766}, // k=99 + {-0.7730104533627370, 0.6343932841636455}, // k=100 + {-0.7883464276266062, 0.6152315905806269}, // k=101 + {-0.8032075314806448, 0.5956993044924335}, // k=102 + {-0.8175848131515836, 0.5758081914178454}, // k=103 + {-0.8314696123025453, 0.5555702330196022}, // k=104 + {-0.8448535652497071, 0.5349976198870972}, // k=105 + {-0.8577286100002720, 0.5141027441932218}, // k=106 + {-0.8700869911087113, 0.4928981922297841}, // k=107 + {-0.8819212643483549, 0.4713967368259979}, // k=108 + {-0.8932243011955152, 0.4496113296546069}, // k=109 + {-0.9039892931234433, 0.4275550934302820}, // k=110 + {-0.9142097557035307, 0.4052413140049899}, // k=111 + {-0.9238795325112867, 0.3826834323650899}, // k=112 + {-0.9329927988347388, 0.3598950365349883}, // k=113 + {-0.9415440651830207, 0.3368898533922203}, // k=114 + {-0.9495281805930367, 0.3136817403988914}, // k=115 + {-0.9569403357322088, 0.2902846772544624}, // k=116 + {-0.9637760657954398, 0.2667127574748985}, // k=117 + {-0.9700312531945440, 0.2429801799032641}, // k=118 + {-0.9757021300385285, 
0.2191012401568700}, // k=119 + {-0.9807852804032304, 0.1950903220161286}, // k=120 + {-0.9852776423889412, 0.1709618887603012}, // k=121 + {-0.9891765099647810, 0.1467304744553618}, // k=122 + {-0.9924795345987100, 0.1224106751992163}, // k=123 + {-0.9951847266721968, 0.0980171403295608}, // k=124 + {-0.9972904566786902, 0.0735645635996677}, // k=125 + {-0.9987954562051724, 0.0490676743274180}, // k=126 + {-0.9996988186962042, 0.0245412285229123}, // k=127 + {-1.0000000000000000, 0.0000000000000001}, // k=128 + {-0.9996988186962042, -0.0245412285229121}, // k=129 + {-0.9987954562051724, -0.0490676743274177}, // k=130 + {-0.9972904566786902, -0.0735645635996675}, // k=131 + {-0.9951847266721969, -0.0980171403295606}, // k=132 + {-0.9924795345987100, -0.1224106751992161}, // k=133 + {-0.9891765099647810, -0.1467304744553616}, // k=134 + {-0.9852776423889413, -0.1709618887603010}, // k=135 + {-0.9807852804032304, -0.1950903220161284}, // k=136 + {-0.9757021300385286, -0.2191012401568698}, // k=137 + {-0.9700312531945440, -0.2429801799032638}, // k=138 + {-0.9637760657954400, -0.2667127574748983}, // k=139 + {-0.9569403357322089, -0.2902846772544621}, // k=140 + {-0.9495281805930368, -0.3136817403988912}, // k=141 + {-0.9415440651830208, -0.3368898533922201}, // k=142 + {-0.9329927988347390, -0.3598950365349881}, // k=143 + {-0.9238795325112868, -0.3826834323650897}, // k=144 + {-0.9142097557035307, -0.4052413140049897}, // k=145 + {-0.9039892931234434, -0.4275550934302818}, // k=146 + {-0.8932243011955153, -0.4496113296546067}, // k=147 + {-0.8819212643483550, -0.4713967368259976}, // k=148 + {-0.8700869911087115, -0.4928981922297839}, // k=149 + {-0.8577286100002721, -0.5141027441932216}, // k=150 + {-0.8448535652497072, -0.5349976198870969}, // k=151 + {-0.8314696123025455, -0.5555702330196020}, // k=152 + {-0.8175848131515837, -0.5758081914178453}, // k=153 + {-0.8032075314806449, -0.5956993044924332}, // k=154 + {-0.7883464276266063, -0.6152315905806267}, // 
k=155 + {-0.7730104533627371, -0.6343932841636453}, // k=156 + {-0.7572088465064848, -0.6531728429537765}, // k=157 + {-0.7409511253549591, -0.6715589548470184}, // k=158 + {-0.7242470829514670, -0.6895405447370668}, // k=159 + {-0.7071067811865477, -0.7071067811865475}, // k=160 + {-0.6895405447370671, -0.7242470829514668}, // k=161 + {-0.6715589548470187, -0.7409511253549589}, // k=162 + {-0.6531728429537771, -0.7572088465064842}, // k=163 + {-0.6343932841636459, -0.7730104533627367}, // k=164 + {-0.6152315905806273, -0.7883464276266059}, // k=165 + {-0.5956993044924331, -0.8032075314806451}, // k=166 + {-0.5758081914178452, -0.8175848131515838}, // k=167 + {-0.5555702330196022, -0.8314696123025452}, // k=168 + {-0.5349976198870973, -0.8448535652497070}, // k=169 + {-0.5141027441932218, -0.8577286100002720}, // k=170 + {-0.4928981922297842, -0.8700869911087113}, // k=171 + {-0.4713967368259979, -0.8819212643483549}, // k=172 + {-0.4496113296546069, -0.8932243011955152}, // k=173 + {-0.4275550934302825, -0.9039892931234431}, // k=174 + {-0.4052413140049904, -0.9142097557035305}, // k=175 + {-0.3826834323650903, -0.9238795325112865}, // k=176 + {-0.3598950365349879, -0.9329927988347390}, // k=177 + {-0.3368898533922199, -0.9415440651830208}, // k=178 + {-0.3136817403988915, -0.9495281805930367}, // k=179 + {-0.2902846772544624, -0.9569403357322088}, // k=180 + {-0.2667127574748985, -0.9637760657954398}, // k=181 + {-0.2429801799032641, -0.9700312531945440}, // k=182 + {-0.2191012401568701, -0.9757021300385285}, // k=183 + {-0.1950903220161287, -0.9807852804032303}, // k=184 + {-0.1709618887603017, -0.9852776423889411}, // k=185 + {-0.1467304744553623, -0.9891765099647809}, // k=186 + {-0.1224106751992160, -0.9924795345987101}, // k=187 + {-0.0980171403295605, -0.9951847266721969}, // k=188 + {-0.0735645635996674, -0.9972904566786902}, // k=189 + {-0.0490676743274180, -0.9987954562051724}, // k=190 + {-0.0245412285229124, -0.9996988186962042}, // k=191 + 
{-0.0000000000000002, -1.0000000000000000}, // k=192 + {0.0245412285229120, -0.9996988186962042}, // k=193 + {0.0490676743274177, -0.9987954562051724}, // k=194 + {0.0735645635996670, -0.9972904566786902}, // k=195 + {0.0980171403295601, -0.9951847266721969}, // k=196 + {0.1224106751992156, -0.9924795345987101}, // k=197 + {0.1467304744553619, -0.9891765099647809}, // k=198 + {0.1709618887603013, -0.9852776423889412}, // k=199 + {0.1950903220161283, -0.9807852804032304}, // k=200 + {0.2191012401568697, -0.9757021300385286}, // k=201 + {0.2429801799032638, -0.9700312531945440}, // k=202 + {0.2667127574748982, -0.9637760657954400}, // k=203 + {0.2902846772544621, -0.9569403357322089}, // k=204 + {0.3136817403988911, -0.9495281805930368}, // k=205 + {0.3368898533922196, -0.9415440651830209}, // k=206 + {0.3598950365349876, -0.9329927988347391}, // k=207 + {0.3826834323650900, -0.9238795325112866}, // k=208 + {0.4052413140049900, -0.9142097557035306}, // k=209 + {0.4275550934302821, -0.9039892931234433}, // k=210 + {0.4496113296546066, -0.8932243011955153}, // k=211 + {0.4713967368259976, -0.8819212643483550}, // k=212 + {0.4928981922297839, -0.8700869911087115}, // k=213 + {0.5141027441932216, -0.8577286100002722}, // k=214 + {0.5349976198870969, -0.8448535652497072}, // k=215 + {0.5555702330196018, -0.8314696123025455}, // k=216 + {0.5758081914178449, -0.8175848131515840}, // k=217 + {0.5956993044924329, -0.8032075314806453}, // k=218 + {0.6152315905806270, -0.7883464276266061}, // k=219 + {0.6343932841636456, -0.7730104533627369}, // k=220 + {0.6531728429537768, -0.7572088465064846}, // k=221 + {0.6715589548470183, -0.7409511253549591}, // k=222 + {0.6895405447370668, -0.7242470829514670}, // k=223 + {0.7071067811865474, -0.7071067811865477}, // k=224 + {0.7242470829514667, -0.6895405447370672}, // k=225 + {0.7409511253549589, -0.6715589548470187}, // k=226 + {0.7572088465064842, -0.6531728429537771}, // k=227 + {0.7730104533627367, -0.6343932841636459}, // k=228 + 
{0.7883464276266059, -0.6152315905806274}, // k=229 + {0.8032075314806451, -0.5956993044924332}, // k=230 + {0.8175848131515837, -0.5758081914178452}, // k=231 + {0.8314696123025452, -0.5555702330196022}, // k=232 + {0.8448535652497070, -0.5349976198870973}, // k=233 + {0.8577286100002720, -0.5141027441932219}, // k=234 + {0.8700869911087113, -0.4928981922297843}, // k=235 + {0.8819212643483548, -0.4713967368259979}, // k=236 + {0.8932243011955151, -0.4496113296546070}, // k=237 + {0.9039892931234431, -0.4275550934302825}, // k=238 + {0.9142097557035305, -0.4052413140049904}, // k=239 + {0.9238795325112865, -0.3826834323650904}, // k=240 + {0.9329927988347390, -0.3598950365349880}, // k=241 + {0.9415440651830208, -0.3368898533922200}, // k=242 + {0.9495281805930367, -0.3136817403988915}, // k=243 + {0.9569403357322088, -0.2902846772544625}, // k=244 + {0.9637760657954398, -0.2667127574748986}, // k=245 + {0.9700312531945440, -0.2429801799032642}, // k=246 + {0.9757021300385285, -0.2191012401568702}, // k=247 + {0.9807852804032303, -0.1950903220161287}, // k=248 + {0.9852776423889411, -0.1709618887603018}, // k=249 + {0.9891765099647809, -0.1467304744553624}, // k=250 + {0.9924795345987100, -0.1224106751992160}, // k=251 + {0.9951847266721969, -0.0980171403295605}, // k=252 + {0.9972904566786902, -0.0735645635996674}, // k=253 + {0.9987954562051724, -0.0490676743274181}, // k=254 + {0.9996988186962042, -0.0245412285229124}, // k=255 +}; +__constant__ double2 c_twiddle_fwd_512[512] = { + {1.0000000000000000, -0.0000000000000000}, // k=0 + {0.9999247018391445, -0.0122715382857199}, // k=1 + {0.9996988186962042, -0.0245412285229123}, // k=2 + {0.9993223845883495, -0.0368072229413588}, // k=3 + {0.9987954562051724, -0.0490676743274180}, // k=4 + {0.9981181129001492, -0.0613207363022086}, // k=5 + {0.9972904566786902, -0.0735645635996674}, // k=6 + {0.9963126121827780, -0.0857973123444399}, // k=7 + {0.9951847266721969, -0.0980171403295606}, // k=8 + {0.9939069700023561, 
-0.1102222072938831}, // k=9 + {0.9924795345987100, -0.1224106751992162}, // k=10 + {0.9909026354277800, -0.1345807085071262}, // k=11 + {0.9891765099647810, -0.1467304744553617}, // k=12 + {0.9873014181578584, -0.1588581433338614}, // k=13 + {0.9852776423889412, -0.1709618887603012}, // k=14 + {0.9831054874312163, -0.1830398879551410}, // k=15 + {0.9807852804032304, -0.1950903220161282}, // k=16 + {0.9783173707196277, -0.2071113761922186}, // k=17 + {0.9757021300385286, -0.2191012401568698}, // k=18 + {0.9729399522055602, -0.2310581082806711}, // k=19 + {0.9700312531945440, -0.2429801799032639}, // k=20 + {0.9669764710448521, -0.2548656596045146}, // k=21 + {0.9637760657954398, -0.2667127574748984}, // k=22 + {0.9604305194155658, -0.2785196893850531}, // k=23 + {0.9569403357322088, -0.2902846772544623}, // k=24 + {0.9533060403541939, -0.3020059493192281}, // k=25 + {0.9495281805930367, -0.3136817403988915}, // k=26 + {0.9456073253805213, -0.3253102921622629}, // k=27 + {0.9415440651830208, -0.3368898533922201}, // k=28 + {0.9373390119125750, -0.3484186802494346}, // k=29 + {0.9329927988347390, -0.3598950365349881}, // k=30 + {0.9285060804732156, -0.3713171939518375}, // k=31 + {0.9238795325112867, -0.3826834323650898}, // k=32 + {0.9191138516900578, -0.3939920400610481}, // k=33 + {0.9142097557035307, -0.4052413140049899}, // k=34 + {0.9091679830905224, -0.4164295600976372}, // k=35 + {0.9039892931234433, -0.4275550934302821}, // k=36 + {0.8986744656939538, -0.4386162385385277}, // k=37 + {0.8932243011955153, -0.4496113296546065}, // k=38 + {0.8876396204028539, -0.4605387109582400}, // k=39 + {0.8819212643483550, -0.4713967368259976}, // k=40 + {0.8760700941954066, -0.4821837720791227}, // k=41 + {0.8700869911087115, -0.4928981922297840}, // k=42 + {0.8639728561215868, -0.5035383837257176}, // k=43 + {0.8577286100002721, -0.5141027441932217}, // k=44 + {0.8513551931052652, -0.5245896826784689}, // k=45 + {0.8448535652497071, -0.5349976198870972}, // k=46 + 
{0.8382247055548381, -0.5453249884220465}, // k=47 + {0.8314696123025452, -0.5555702330196022}, // k=48 + {0.8245893027850253, -0.5657318107836131}, // k=49 + {0.8175848131515837, -0.5758081914178453}, // k=50 + {0.8104571982525948, -0.5857978574564389}, // k=51 + {0.8032075314806449, -0.5956993044924334}, // k=52 + {0.7958369046088836, -0.6055110414043255}, // k=53 + {0.7883464276266063, -0.6152315905806268}, // k=54 + {0.7807372285720945, -0.6248594881423863}, // k=55 + {0.7730104533627370, -0.6343932841636455}, // k=56 + {0.7651672656224590, -0.6438315428897914}, // k=57 + {0.7572088465064846, -0.6531728429537768}, // k=58 + {0.7491363945234594, -0.6624157775901718}, // k=59 + {0.7409511253549591, -0.6715589548470183}, // k=60 + {0.7326542716724128, -0.6806009977954530}, // k=61 + {0.7242470829514670, -0.6895405447370668}, // k=62 + {0.7157308252838186, -0.6983762494089729}, // k=63 + {0.7071067811865476, -0.7071067811865475}, // k=64 + {0.6983762494089729, -0.7157308252838186}, // k=65 + {0.6895405447370669, -0.7242470829514669}, // k=66 + {0.6806009977954531, -0.7326542716724128}, // k=67 + {0.6715589548470183, -0.7409511253549591}, // k=68 + {0.6624157775901718, -0.7491363945234593}, // k=69 + {0.6531728429537768, -0.7572088465064845}, // k=70 + {0.6438315428897915, -0.7651672656224590}, // k=71 + {0.6343932841636455, -0.7730104533627370}, // k=72 + {0.6248594881423865, -0.7807372285720944}, // k=73 + {0.6152315905806268, -0.7883464276266062}, // k=74 + {0.6055110414043255, -0.7958369046088835}, // k=75 + {0.5956993044924335, -0.8032075314806448}, // k=76 + {0.5857978574564389, -0.8104571982525948}, // k=77 + {0.5758081914178453, -0.8175848131515837}, // k=78 + {0.5657318107836132, -0.8245893027850253}, // k=79 + {0.5555702330196023, -0.8314696123025452}, // k=80 + {0.5453249884220465, -0.8382247055548380}, // k=81 + {0.5349976198870973, -0.8448535652497070}, // k=82 + {0.5245896826784688, -0.8513551931052652}, // k=83 + {0.5141027441932217, 
-0.8577286100002721}, // k=84 + {0.5035383837257176, -0.8639728561215867}, // k=85 + {0.4928981922297841, -0.8700869911087113}, // k=86 + {0.4821837720791228, -0.8760700941954066}, // k=87 + {0.4713967368259978, -0.8819212643483549}, // k=88 + {0.4605387109582400, -0.8876396204028539}, // k=89 + {0.4496113296546066, -0.8932243011955153}, // k=90 + {0.4386162385385277, -0.8986744656939538}, // k=91 + {0.4275550934302822, -0.9039892931234433}, // k=92 + {0.4164295600976373, -0.9091679830905223}, // k=93 + {0.4052413140049899, -0.9142097557035307}, // k=94 + {0.3939920400610481, -0.9191138516900578}, // k=95 + {0.3826834323650898, -0.9238795325112867}, // k=96 + {0.3713171939518376, -0.9285060804732155}, // k=97 + {0.3598950365349883, -0.9329927988347388}, // k=98 + {0.3484186802494345, -0.9373390119125750}, // k=99 + {0.3368898533922201, -0.9415440651830208}, // k=100 + {0.3253102921622630, -0.9456073253805213}, // k=101 + {0.3136817403988916, -0.9495281805930367}, // k=102 + {0.3020059493192282, -0.9533060403541938}, // k=103 + {0.2902846772544623, -0.9569403357322089}, // k=104 + {0.2785196893850531, -0.9604305194155658}, // k=105 + {0.2667127574748984, -0.9637760657954398}, // k=106 + {0.2548656596045146, -0.9669764710448521}, // k=107 + {0.2429801799032640, -0.9700312531945440}, // k=108 + {0.2310581082806713, -0.9729399522055601}, // k=109 + {0.2191012401568698, -0.9757021300385286}, // k=110 + {0.2071113761922186, -0.9783173707196277}, // k=111 + {0.1950903220161283, -0.9807852804032304}, // k=112 + {0.1830398879551411, -0.9831054874312163}, // k=113 + {0.1709618887603014, -0.9852776423889412}, // k=114 + {0.1588581433338614, -0.9873014181578584}, // k=115 + {0.1467304744553617, -0.9891765099647810}, // k=116 + {0.1345807085071262, -0.9909026354277800}, // k=117 + {0.1224106751992163, -0.9924795345987100}, // k=118 + {0.1102222072938832, -0.9939069700023561}, // k=119 + {0.0980171403295608, -0.9951847266721968}, // k=120 + {0.0857973123444399, 
-0.9963126121827780}, // k=121 + {0.0735645635996675, -0.9972904566786902}, // k=122 + {0.0613207363022086, -0.9981181129001492}, // k=123 + {0.0490676743274181, -0.9987954562051724}, // k=124 + {0.0368072229413590, -0.9993223845883495}, // k=125 + {0.0245412285229123, -0.9996988186962042}, // k=126 + {0.0122715382857199, -0.9999247018391445}, // k=127 + {0.0000000000000001, -1.0000000000000000}, // k=128 + {-0.0122715382857198, -0.9999247018391445}, // k=129 + {-0.0245412285229121, -0.9996988186962042}, // k=130 + {-0.0368072229413589, -0.9993223845883495}, // k=131 + {-0.0490676743274180, -0.9987954562051724}, // k=132 + {-0.0613207363022085, -0.9981181129001492}, // k=133 + {-0.0735645635996673, -0.9972904566786902}, // k=134 + {-0.0857973123444398, -0.9963126121827780}, // k=135 + {-0.0980171403295606, -0.9951847266721969}, // k=136 + {-0.1102222072938831, -0.9939069700023561}, // k=137 + {-0.1224106751992162, -0.9924795345987100}, // k=138 + {-0.1345807085071261, -0.9909026354277800}, // k=139 + {-0.1467304744553616, -0.9891765099647810}, // k=140 + {-0.1588581433338613, -0.9873014181578584}, // k=141 + {-0.1709618887603012, -0.9852776423889412}, // k=142 + {-0.1830398879551409, -0.9831054874312163}, // k=143 + {-0.1950903220161282, -0.9807852804032304}, // k=144 + {-0.2071113761922184, -0.9783173707196277}, // k=145 + {-0.2191012401568697, -0.9757021300385286}, // k=146 + {-0.2310581082806711, -0.9729399522055602}, // k=147 + {-0.2429801799032639, -0.9700312531945440}, // k=148 + {-0.2548656596045145, -0.9669764710448521}, // k=149 + {-0.2667127574748983, -0.9637760657954398}, // k=150 + {-0.2785196893850529, -0.9604305194155659}, // k=151 + {-0.2902846772544622, -0.9569403357322089}, // k=152 + {-0.3020059493192281, -0.9533060403541939}, // k=153 + {-0.3136817403988914, -0.9495281805930367}, // k=154 + {-0.3253102921622629, -0.9456073253805214}, // k=155 + {-0.3368898533922199, -0.9415440651830208}, // k=156 + {-0.3484186802494344, -0.9373390119125750}, // 
k=157 + {-0.3598950365349882, -0.9329927988347388}, // k=158 + {-0.3713171939518375, -0.9285060804732156}, // k=159 + {-0.3826834323650897, -0.9238795325112867}, // k=160 + {-0.3939920400610480, -0.9191138516900578}, // k=161 + {-0.4052413140049897, -0.9142097557035307}, // k=162 + {-0.4164295600976370, -0.9091679830905225}, // k=163 + {-0.4275550934302819, -0.9039892931234434}, // k=164 + {-0.4386162385385274, -0.8986744656939539}, // k=165 + {-0.4496113296546067, -0.8932243011955152}, // k=166 + {-0.4605387109582401, -0.8876396204028539}, // k=167 + {-0.4713967368259977, -0.8819212643483550}, // k=168 + {-0.4821837720791227, -0.8760700941954066}, // k=169 + {-0.4928981922297840, -0.8700869911087115}, // k=170 + {-0.5035383837257175, -0.8639728561215868}, // k=171 + {-0.5141027441932217, -0.8577286100002721}, // k=172 + {-0.5245896826784687, -0.8513551931052652}, // k=173 + {-0.5349976198870970, -0.8448535652497072}, // k=174 + {-0.5453249884220462, -0.8382247055548382}, // k=175 + {-0.5555702330196020, -0.8314696123025455}, // k=176 + {-0.5657318107836132, -0.8245893027850252}, // k=177 + {-0.5758081914178453, -0.8175848131515837}, // k=178 + {-0.5857978574564389, -0.8104571982525948}, // k=179 + {-0.5956993044924334, -0.8032075314806449}, // k=180 + {-0.6055110414043254, -0.7958369046088836}, // k=181 + {-0.6152315905806267, -0.7883464276266063}, // k=182 + {-0.6248594881423862, -0.7807372285720946}, // k=183 + {-0.6343932841636454, -0.7730104533627371}, // k=184 + {-0.6438315428897913, -0.7651672656224591}, // k=185 + {-0.6531728429537765, -0.7572088465064847}, // k=186 + {-0.6624157775901719, -0.7491363945234593}, // k=187 + {-0.6715589548470184, -0.7409511253549590}, // k=188 + {-0.6806009977954530, -0.7326542716724128}, // k=189 + {-0.6895405447370669, -0.7242470829514669}, // k=190 + {-0.6983762494089728, -0.7157308252838187}, // k=191 + {-0.7071067811865475, -0.7071067811865476}, // k=192 + {-0.7157308252838186, -0.6983762494089729}, // k=193 + 
{-0.7242470829514668, -0.6895405447370671}, // k=194 + {-0.7326542716724127, -0.6806009977954532}, // k=195 + {-0.7409511253549589, -0.6715589548470186}, // k=196 + {-0.7491363945234591, -0.6624157775901720}, // k=197 + {-0.7572088465064846, -0.6531728429537766}, // k=198 + {-0.7651672656224590, -0.6438315428897914}, // k=199 + {-0.7730104533627370, -0.6343932841636455}, // k=200 + {-0.7807372285720945, -0.6248594881423863}, // k=201 + {-0.7883464276266062, -0.6152315905806269}, // k=202 + {-0.7958369046088835, -0.6055110414043257}, // k=203 + {-0.8032075314806448, -0.5956993044924335}, // k=204 + {-0.8104571982525947, -0.5857978574564390}, // k=205 + {-0.8175848131515836, -0.5758081914178454}, // k=206 + {-0.8245893027850251, -0.5657318107836135}, // k=207 + {-0.8314696123025453, -0.5555702330196022}, // k=208 + {-0.8382247055548381, -0.5453249884220464}, // k=209 + {-0.8448535652497071, -0.5349976198870972}, // k=210 + {-0.8513551931052652, -0.5245896826784689}, // k=211 + {-0.8577286100002720, -0.5141027441932218}, // k=212 + {-0.8639728561215867, -0.5035383837257177}, // k=213 + {-0.8700869911087113, -0.4928981922297841}, // k=214 + {-0.8760700941954065, -0.4821837720791229}, // k=215 + {-0.8819212643483549, -0.4713967368259979}, // k=216 + {-0.8876396204028538, -0.4605387109582402}, // k=217 + {-0.8932243011955152, -0.4496113296546069}, // k=218 + {-0.8986744656939539, -0.4386162385385275}, // k=219 + {-0.9039892931234433, -0.4275550934302820}, // k=220 + {-0.9091679830905224, -0.4164295600976372}, // k=221 + {-0.9142097557035307, -0.4052413140049899}, // k=222 + {-0.9191138516900578, -0.3939920400610482}, // k=223 + {-0.9238795325112867, -0.3826834323650899}, // k=224 + {-0.9285060804732155, -0.3713171939518377}, // k=225 + {-0.9329927988347388, -0.3598950365349883}, // k=226 + {-0.9373390119125748, -0.3484186802494348}, // k=227 + {-0.9415440651830207, -0.3368898533922203}, // k=228 + {-0.9456073253805212, -0.3253102921622633}, // k=229 + 
{-0.9495281805930367, -0.3136817403988914}, // k=230 + {-0.9533060403541939, -0.3020059493192280}, // k=231 + {-0.9569403357322088, -0.2902846772544624}, // k=232 + {-0.9604305194155658, -0.2785196893850532}, // k=233 + {-0.9637760657954398, -0.2667127574748985}, // k=234 + {-0.9669764710448521, -0.2548656596045147}, // k=235 + {-0.9700312531945440, -0.2429801799032641}, // k=236 + {-0.9729399522055601, -0.2310581082806713}, // k=237 + {-0.9757021300385285, -0.2191012401568700}, // k=238 + {-0.9783173707196275, -0.2071113761922188}, // k=239 + {-0.9807852804032304, -0.1950903220161286}, // k=240 + {-0.9831054874312163, -0.1830398879551409}, // k=241 + {-0.9852776423889412, -0.1709618887603012}, // k=242 + {-0.9873014181578584, -0.1588581433338615}, // k=243 + {-0.9891765099647810, -0.1467304744553618}, // k=244 + {-0.9909026354277800, -0.1345807085071263}, // k=245 + {-0.9924795345987100, -0.1224106751992163}, // k=246 + {-0.9939069700023561, -0.1102222072938832}, // k=247 + {-0.9951847266721968, -0.0980171403295608}, // k=248 + {-0.9963126121827780, -0.0857973123444402}, // k=249 + {-0.9972904566786902, -0.0735645635996677}, // k=250 + {-0.9981181129001492, -0.0613207363022085}, // k=251 + {-0.9987954562051724, -0.0490676743274180}, // k=252 + {-0.9993223845883495, -0.0368072229413588}, // k=253 + {-0.9996988186962042, -0.0245412285229123}, // k=254 + {-0.9999247018391445, -0.0122715382857200}, // k=255 + {-1.0000000000000000, -0.0000000000000001}, // k=256 + {-0.9999247018391445, 0.0122715382857198}, // k=257 + {-0.9996988186962042, 0.0245412285229121}, // k=258 + {-0.9993223845883495, 0.0368072229413586}, // k=259 + {-0.9987954562051724, 0.0490676743274177}, // k=260 + {-0.9981181129001492, 0.0613207363022082}, // k=261 + {-0.9972904566786902, 0.0735645635996675}, // k=262 + {-0.9963126121827780, 0.0857973123444399}, // k=263 + {-0.9951847266721969, 0.0980171403295606}, // k=264 + {-0.9939069700023561, 0.1102222072938830}, // k=265 + {-0.9924795345987100, 
0.1224106751992161}, // k=266 + {-0.9909026354277800, 0.1345807085071261}, // k=267 + {-0.9891765099647810, 0.1467304744553616}, // k=268 + {-0.9873014181578584, 0.1588581433338612}, // k=269 + {-0.9852776423889413, 0.1709618887603010}, // k=270 + {-0.9831054874312164, 0.1830398879551406}, // k=271 + {-0.9807852804032304, 0.1950903220161284}, // k=272 + {-0.9783173707196277, 0.2071113761922186}, // k=273 + {-0.9757021300385286, 0.2191012401568698}, // k=274 + {-0.9729399522055602, 0.2310581082806711}, // k=275 + {-0.9700312531945440, 0.2429801799032638}, // k=276 + {-0.9669764710448522, 0.2548656596045145}, // k=277 + {-0.9637760657954400, 0.2667127574748983}, // k=278 + {-0.9604305194155659, 0.2785196893850529}, // k=279 + {-0.9569403357322089, 0.2902846772544621}, // k=280 + {-0.9533060403541940, 0.3020059493192278}, // k=281 + {-0.9495281805930368, 0.3136817403988912}, // k=282 + {-0.9456073253805213, 0.3253102921622630}, // k=283 + {-0.9415440651830208, 0.3368898533922201}, // k=284 + {-0.9373390119125750, 0.3484186802494346}, // k=285 + {-0.9329927988347390, 0.3598950365349881}, // k=286 + {-0.9285060804732156, 0.3713171939518374}, // k=287 + {-0.9238795325112868, 0.3826834323650897}, // k=288 + {-0.9191138516900578, 0.3939920400610479}, // k=289 + {-0.9142097557035307, 0.4052413140049897}, // k=290 + {-0.9091679830905225, 0.4164295600976369}, // k=291 + {-0.9039892931234434, 0.4275550934302818}, // k=292 + {-0.8986744656939540, 0.4386162385385273}, // k=293 + {-0.8932243011955153, 0.4496113296546067}, // k=294 + {-0.8876396204028539, 0.4605387109582401}, // k=295 + {-0.8819212643483550, 0.4713967368259976}, // k=296 + {-0.8760700941954066, 0.4821837720791227}, // k=297 + {-0.8700869911087115, 0.4928981922297839}, // k=298 + {-0.8639728561215868, 0.5035383837257175}, // k=299 + {-0.8577286100002721, 0.5141027441932216}, // k=300 + {-0.8513551931052653, 0.5245896826784687}, // k=301 + {-0.8448535652497072, 0.5349976198870969}, // k=302 + {-0.8382247055548382, 
0.5453249884220461}, // k=303 + {-0.8314696123025455, 0.5555702330196020}, // k=304 + {-0.8245893027850253, 0.5657318107836132}, // k=305 + {-0.8175848131515837, 0.5758081914178453}, // k=306 + {-0.8104571982525948, 0.5857978574564389}, // k=307 + {-0.8032075314806449, 0.5956993044924332}, // k=308 + {-0.7958369046088836, 0.6055110414043254}, // k=309 + {-0.7883464276266063, 0.6152315905806267}, // k=310 + {-0.7807372285720946, 0.6248594881423862}, // k=311 + {-0.7730104533627371, 0.6343932841636453}, // k=312 + {-0.7651672656224591, 0.6438315428897913}, // k=313 + {-0.7572088465064848, 0.6531728429537765}, // k=314 + {-0.7491363945234593, 0.6624157775901718}, // k=315 + {-0.7409511253549591, 0.6715589548470184}, // k=316 + {-0.7326542716724128, 0.6806009977954530}, // k=317 + {-0.7242470829514670, 0.6895405447370668}, // k=318 + {-0.7157308252838187, 0.6983762494089728}, // k=319 + {-0.7071067811865477, 0.7071067811865475}, // k=320 + {-0.6983762494089730, 0.7157308252838185}, // k=321 + {-0.6895405447370671, 0.7242470829514668}, // k=322 + {-0.6806009977954532, 0.7326542716724126}, // k=323 + {-0.6715589548470187, 0.7409511253549589}, // k=324 + {-0.6624157775901720, 0.7491363945234590}, // k=325 + {-0.6531728429537771, 0.7572088465064842}, // k=326 + {-0.6438315428897915, 0.7651672656224590}, // k=327 + {-0.6343932841636459, 0.7730104533627367}, // k=328 + {-0.6248594881423865, 0.7807372285720944}, // k=329 + {-0.6152315905806273, 0.7883464276266059}, // k=330 + {-0.6055110414043257, 0.7958369046088835}, // k=331 + {-0.5956993044924331, 0.8032075314806451}, // k=332 + {-0.5857978574564391, 0.8104571982525947}, // k=333 + {-0.5758081914178452, 0.8175848131515838}, // k=334 + {-0.5657318107836135, 0.8245893027850251}, // k=335 + {-0.5555702330196022, 0.8314696123025452}, // k=336 + {-0.5453249884220468, 0.8382247055548379}, // k=337 + {-0.5349976198870973, 0.8448535652497070}, // k=338 + {-0.5245896826784694, 0.8513551931052649}, // k=339 + {-0.5141027441932218, 
0.8577286100002720}, // k=340 + {-0.5035383837257180, 0.8639728561215865}, // k=341 + {-0.4928981922297842, 0.8700869911087113}, // k=342 + {-0.4821837720791226, 0.8760700941954067}, // k=343 + {-0.4713967368259979, 0.8819212643483549}, // k=344 + {-0.4605387109582399, 0.8876396204028540}, // k=345 + {-0.4496113296546069, 0.8932243011955152}, // k=346 + {-0.4386162385385276, 0.8986744656939538}, // k=347 + {-0.4275550934302825, 0.9039892931234431}, // k=348 + {-0.4164295600976372, 0.9091679830905224}, // k=349 + {-0.4052413140049904, 0.9142097557035305}, // k=350 + {-0.3939920400610482, 0.9191138516900577}, // k=351 + {-0.3826834323650903, 0.9238795325112865}, // k=352 + {-0.3713171939518378, 0.9285060804732155}, // k=353 + {-0.3598950365349879, 0.9329927988347390}, // k=354 + {-0.3484186802494348, 0.9373390119125748}, // k=355 + {-0.3368898533922199, 0.9415440651830208}, // k=356 + {-0.3253102921622633, 0.9456073253805212}, // k=357 + {-0.3136817403988915, 0.9495281805930367}, // k=358 + {-0.3020059493192285, 0.9533060403541938}, // k=359 + {-0.2902846772544624, 0.9569403357322088}, // k=360 + {-0.2785196893850536, 0.9604305194155657}, // k=361 + {-0.2667127574748985, 0.9637760657954398}, // k=362 + {-0.2548656596045143, 0.9669764710448522}, // k=363 + {-0.2429801799032641, 0.9700312531945440}, // k=364 + {-0.2310581082806709, 0.9729399522055602}, // k=365 + {-0.2191012401568701, 0.9757021300385285}, // k=366 + {-0.2071113761922185, 0.9783173707196277}, // k=367 + {-0.1950903220161287, 0.9807852804032303}, // k=368 + {-0.1830398879551410, 0.9831054874312163}, // k=369 + {-0.1709618887603017, 0.9852776423889411}, // k=370 + {-0.1588581433338615, 0.9873014181578583}, // k=371 + {-0.1467304744553623, 0.9891765099647809}, // k=372 + {-0.1345807085071264, 0.9909026354277800}, // k=373 + {-0.1224106751992160, 0.9924795345987101}, // k=374 + {-0.1102222072938833, 0.9939069700023561}, // k=375 + {-0.0980171403295605, 0.9951847266721969}, // k=376 + {-0.0857973123444402, 
0.9963126121827780}, // k=377 + {-0.0735645635996674, 0.9972904566786902}, // k=378 + {-0.0613207363022090, 0.9981181129001492}, // k=379 + {-0.0490676743274180, 0.9987954562051724}, // k=380 + {-0.0368072229413593, 0.9993223845883494}, // k=381 + {-0.0245412285229124, 0.9996988186962042}, // k=382 + {-0.0122715382857205, 0.9999247018391445}, // k=383 + {-0.0000000000000002, 1.0000000000000000}, // k=384 + {0.0122715382857201, 0.9999247018391445}, // k=385 + {0.0245412285229120, 0.9996988186962042}, // k=386 + {0.0368072229413590, 0.9993223845883495}, // k=387 + {0.0490676743274177, 0.9987954562051724}, // k=388 + {0.0613207363022086, 0.9981181129001492}, // k=389 + {0.0735645635996670, 0.9972904566786902}, // k=390 + {0.0857973123444399, 0.9963126121827780}, // k=391 + {0.0980171403295601, 0.9951847266721969}, // k=392 + {0.1102222072938829, 0.9939069700023561}, // k=393 + {0.1224106751992156, 0.9924795345987101}, // k=394 + {0.1345807085071260, 0.9909026354277800}, // k=395 + {0.1467304744553619, 0.9891765099647809}, // k=396 + {0.1588581433338612, 0.9873014181578584}, // k=397 + {0.1709618887603013, 0.9852776423889412}, // k=398 + {0.1830398879551406, 0.9831054874312164}, // k=399 + {0.1950903220161283, 0.9807852804032304}, // k=400 + {0.2071113761922181, 0.9783173707196278}, // k=401 + {0.2191012401568697, 0.9757021300385286}, // k=402 + {0.2310581082806706, 0.9729399522055603}, // k=403 + {0.2429801799032638, 0.9700312531945440}, // k=404 + {0.2548656596045140, 0.9669764710448523}, // k=405 + {0.2667127574748982, 0.9637760657954400}, // k=406 + {0.2785196893850533, 0.9604305194155658}, // k=407 + {0.2902846772544621, 0.9569403357322089}, // k=408 + {0.3020059493192281, 0.9533060403541939}, // k=409 + {0.3136817403988911, 0.9495281805930368}, // k=410 + {0.3253102921622629, 0.9456073253805213}, // k=411 + {0.3368898533922196, 0.9415440651830209}, // k=412 + {0.3484186802494345, 0.9373390119125750}, // k=413 + {0.3598950365349876, 0.9329927988347391}, // k=414 + 
{0.3713171939518374, 0.9285060804732156}, // k=415 + {0.3826834323650900, 0.9238795325112866}, // k=416 + {0.3939920400610479, 0.9191138516900579}, // k=417 + {0.4052413140049900, 0.9142097557035306}, // k=418 + {0.4164295600976369, 0.9091679830905225}, // k=419 + {0.4275550934302821, 0.9039892931234433}, // k=420 + {0.4386162385385273, 0.8986744656939540}, // k=421 + {0.4496113296546066, 0.8932243011955153}, // k=422 + {0.4605387109582396, 0.8876396204028542}, // k=423 + {0.4713967368259976, 0.8819212643483550}, // k=424 + {0.4821837720791222, 0.8760700941954069}, // k=425 + {0.4928981922297839, 0.8700869911087115}, // k=426 + {0.5035383837257178, 0.8639728561215866}, // k=427 + {0.5141027441932216, 0.8577286100002722}, // k=428 + {0.5245896826784691, 0.8513551931052651}, // k=429 + {0.5349976198870969, 0.8448535652497072}, // k=430 + {0.5453249884220465, 0.8382247055548380}, // k=431 + {0.5555702330196018, 0.8314696123025455}, // k=432 + {0.5657318107836131, 0.8245893027850253}, // k=433 + {0.5758081914178449, 0.8175848131515840}, // k=434 + {0.5857978574564388, 0.8104571982525949}, // k=435 + {0.5956993044924329, 0.8032075314806453}, // k=436 + {0.6055110414043253, 0.7958369046088837}, // k=437 + {0.6152315905806270, 0.7883464276266061}, // k=438 + {0.6248594881423861, 0.7807372285720946}, // k=439 + {0.6343932841636456, 0.7730104533627369}, // k=440 + {0.6438315428897912, 0.7651672656224592}, // k=441 + {0.6531728429537768, 0.7572088465064846}, // k=442 + {0.6624157775901715, 0.7491363945234596}, // k=443 + {0.6715589548470183, 0.7409511253549591}, // k=444 + {0.6806009977954527, 0.7326542716724131}, // k=445 + {0.6895405447370668, 0.7242470829514670}, // k=446 + {0.6983762494089724, 0.7157308252838190}, // k=447 + {0.7071067811865474, 0.7071067811865477}, // k=448 + {0.7157308252838188, 0.6983762494089727}, // k=449 + {0.7242470829514667, 0.6895405447370672}, // k=450 + {0.7326542716724129, 0.6806009977954530}, // k=451 + {0.7409511253549589, 
0.6715589548470187}, // k=452 + {0.7491363945234594, 0.6624157775901718}, // k=453 + {0.7572088465064842, 0.6531728429537771}, // k=454 + {0.7651672656224588, 0.6438315428897915}, // k=455 + {0.7730104533627367, 0.6343932841636459}, // k=456 + {0.7807372285720944, 0.6248594881423865}, // k=457 + {0.7883464276266059, 0.6152315905806274}, // k=458 + {0.7958369046088833, 0.6055110414043257}, // k=459 + {0.8032075314806451, 0.5956993044924332}, // k=460 + {0.8104571982525947, 0.5857978574564391}, // k=461 + {0.8175848131515837, 0.5758081914178452}, // k=462 + {0.8245893027850251, 0.5657318107836136}, // k=463 + {0.8314696123025452, 0.5555702330196022}, // k=464 + {0.8382247055548377, 0.5453249884220468}, // k=465 + {0.8448535652497070, 0.5349976198870973}, // k=466 + {0.8513551931052649, 0.5245896826784694}, // k=467 + {0.8577286100002720, 0.5141027441932219}, // k=468 + {0.8639728561215864, 0.5035383837257181}, // k=469 + {0.8700869911087113, 0.4928981922297843}, // k=470 + {0.8760700941954067, 0.4821837720791226}, // k=471 + {0.8819212643483548, 0.4713967368259979}, // k=472 + {0.8876396204028539, 0.4605387109582399}, // k=473 + {0.8932243011955151, 0.4496113296546070}, // k=474 + {0.8986744656939538, 0.4386162385385277}, // k=475 + {0.9039892931234431, 0.4275550934302825}, // k=476 + {0.9091679830905224, 0.4164295600976373}, // k=477 + {0.9142097557035305, 0.4052413140049904}, // k=478 + {0.9191138516900577, 0.3939920400610483}, // k=479 + {0.9238795325112865, 0.3826834323650904}, // k=480 + {0.9285060804732155, 0.3713171939518378}, // k=481 + {0.9329927988347390, 0.3598950365349880}, // k=482 + {0.9373390119125748, 0.3484186802494349}, // k=483 + {0.9415440651830208, 0.3368898533922200}, // k=484 + {0.9456073253805212, 0.3253102921622634}, // k=485 + {0.9495281805930367, 0.3136817403988915}, // k=486 + {0.9533060403541936, 0.3020059493192286}, // k=487 + {0.9569403357322088, 0.2902846772544625}, // k=488 + {0.9604305194155657, 0.2785196893850537}, // k=489 + 
{0.9637760657954398, 0.2667127574748986}, // k=490 + {0.9669764710448522, 0.2548656596045144}, // k=491 + {0.9700312531945440, 0.2429801799032642}, // k=492 + {0.9729399522055602, 0.2310581082806710}, // k=493 + {0.9757021300385285, 0.2191012401568702}, // k=494 + {0.9783173707196277, 0.2071113761922185}, // k=495 + {0.9807852804032303, 0.1950903220161287}, // k=496 + {0.9831054874312163, 0.1830398879551410}, // k=497 + {0.9852776423889411, 0.1709618887603018}, // k=498 + {0.9873014181578583, 0.1588581433338616}, // k=499 + {0.9891765099647809, 0.1467304744553624}, // k=500 + {0.9909026354277800, 0.1345807085071264}, // k=501 + {0.9924795345987100, 0.1224106751992160}, // k=502 + {0.9939069700023561, 0.1102222072938834}, // k=503 + {0.9951847266721969, 0.0980171403295605}, // k=504 + {0.9963126121827780, 0.0857973123444403}, // k=505 + {0.9972904566786902, 0.0735645635996674}, // k=506 + {0.9981181129001492, 0.0613207363022091}, // k=507 + {0.9987954562051724, 0.0490676743274181}, // k=508 + {0.9993223845883494, 0.0368072229413594}, // k=509 + {0.9996988186962042, 0.0245412285229124}, // k=510 + {0.9999247018391445, 0.0122715382857206}, // k=511 +}; + +__constant__ double2 c_twiddle_inv_512[512] = { + {1.0000000000000000, 0.0000000000000000}, // k=0 + {0.9999247018391445, 0.0122715382857199}, // k=1 + {0.9996988186962042, 0.0245412285229123}, // k=2 + {0.9993223845883495, 0.0368072229413588}, // k=3 + {0.9987954562051724, 0.0490676743274180}, // k=4 + {0.9981181129001492, 0.0613207363022086}, // k=5 + {0.9972904566786902, 0.0735645635996674}, // k=6 + {0.9963126121827780, 0.0857973123444399}, // k=7 + {0.9951847266721969, 0.0980171403295606}, // k=8 + {0.9939069700023561, 0.1102222072938831}, // k=9 + {0.9924795345987100, 0.1224106751992162}, // k=10 + {0.9909026354277800, 0.1345807085071262}, // k=11 + {0.9891765099647810, 0.1467304744553617}, // k=12 + {0.9873014181578584, 0.1588581433338614}, // k=13 + {0.9852776423889412, 0.1709618887603012}, // k=14 + 
{0.9831054874312163, 0.1830398879551410}, // k=15 + {0.9807852804032304, 0.1950903220161282}, // k=16 + {0.9783173707196277, 0.2071113761922186}, // k=17 + {0.9757021300385286, 0.2191012401568698}, // k=18 + {0.9729399522055602, 0.2310581082806711}, // k=19 + {0.9700312531945440, 0.2429801799032639}, // k=20 + {0.9669764710448521, 0.2548656596045146}, // k=21 + {0.9637760657954398, 0.2667127574748984}, // k=22 + {0.9604305194155658, 0.2785196893850531}, // k=23 + {0.9569403357322088, 0.2902846772544623}, // k=24 + {0.9533060403541939, 0.3020059493192281}, // k=25 + {0.9495281805930367, 0.3136817403988915}, // k=26 + {0.9456073253805213, 0.3253102921622629}, // k=27 + {0.9415440651830208, 0.3368898533922201}, // k=28 + {0.9373390119125750, 0.3484186802494346}, // k=29 + {0.9329927988347390, 0.3598950365349881}, // k=30 + {0.9285060804732156, 0.3713171939518375}, // k=31 + {0.9238795325112867, 0.3826834323650898}, // k=32 + {0.9191138516900578, 0.3939920400610481}, // k=33 + {0.9142097557035307, 0.4052413140049899}, // k=34 + {0.9091679830905224, 0.4164295600976372}, // k=35 + {0.9039892931234433, 0.4275550934302821}, // k=36 + {0.8986744656939538, 0.4386162385385277}, // k=37 + {0.8932243011955153, 0.4496113296546065}, // k=38 + {0.8876396204028539, 0.4605387109582400}, // k=39 + {0.8819212643483550, 0.4713967368259976}, // k=40 + {0.8760700941954066, 0.4821837720791227}, // k=41 + {0.8700869911087115, 0.4928981922297840}, // k=42 + {0.8639728561215868, 0.5035383837257176}, // k=43 + {0.8577286100002721, 0.5141027441932217}, // k=44 + {0.8513551931052652, 0.5245896826784689}, // k=45 + {0.8448535652497071, 0.5349976198870972}, // k=46 + {0.8382247055548381, 0.5453249884220465}, // k=47 + {0.8314696123025452, 0.5555702330196022}, // k=48 + {0.8245893027850253, 0.5657318107836131}, // k=49 + {0.8175848131515837, 0.5758081914178453}, // k=50 + {0.8104571982525948, 0.5857978574564389}, // k=51 + {0.8032075314806449, 0.5956993044924334}, // k=52 + {0.7958369046088836, 
0.6055110414043255}, // k=53 + {0.7883464276266063, 0.6152315905806268}, // k=54 + {0.7807372285720945, 0.6248594881423863}, // k=55 + {0.7730104533627370, 0.6343932841636455}, // k=56 + {0.7651672656224590, 0.6438315428897914}, // k=57 + {0.7572088465064846, 0.6531728429537768}, // k=58 + {0.7491363945234594, 0.6624157775901718}, // k=59 + {0.7409511253549591, 0.6715589548470183}, // k=60 + {0.7326542716724128, 0.6806009977954530}, // k=61 + {0.7242470829514670, 0.6895405447370668}, // k=62 + {0.7157308252838186, 0.6983762494089729}, // k=63 + {0.7071067811865476, 0.7071067811865475}, // k=64 + {0.6983762494089729, 0.7157308252838186}, // k=65 + {0.6895405447370669, 0.7242470829514669}, // k=66 + {0.6806009977954531, 0.7326542716724128}, // k=67 + {0.6715589548470183, 0.7409511253549591}, // k=68 + {0.6624157775901718, 0.7491363945234593}, // k=69 + {0.6531728429537768, 0.7572088465064845}, // k=70 + {0.6438315428897915, 0.7651672656224590}, // k=71 + {0.6343932841636455, 0.7730104533627370}, // k=72 + {0.6248594881423865, 0.7807372285720944}, // k=73 + {0.6152315905806268, 0.7883464276266062}, // k=74 + {0.6055110414043255, 0.7958369046088835}, // k=75 + {0.5956993044924335, 0.8032075314806448}, // k=76 + {0.5857978574564389, 0.8104571982525948}, // k=77 + {0.5758081914178453, 0.8175848131515837}, // k=78 + {0.5657318107836132, 0.8245893027850253}, // k=79 + {0.5555702330196023, 0.8314696123025452}, // k=80 + {0.5453249884220465, 0.8382247055548380}, // k=81 + {0.5349976198870973, 0.8448535652497070}, // k=82 + {0.5245896826784688, 0.8513551931052652}, // k=83 + {0.5141027441932217, 0.8577286100002721}, // k=84 + {0.5035383837257176, 0.8639728561215867}, // k=85 + {0.4928981922297841, 0.8700869911087113}, // k=86 + {0.4821837720791228, 0.8760700941954066}, // k=87 + {0.4713967368259978, 0.8819212643483549}, // k=88 + {0.4605387109582400, 0.8876396204028539}, // k=89 + {0.4496113296546066, 0.8932243011955153}, // k=90 + {0.4386162385385277, 0.8986744656939538}, // 
k=91 + {0.4275550934302822, 0.9039892931234433}, // k=92 + {0.4164295600976373, 0.9091679830905223}, // k=93 + {0.4052413140049899, 0.9142097557035307}, // k=94 + {0.3939920400610481, 0.9191138516900578}, // k=95 + {0.3826834323650898, 0.9238795325112867}, // k=96 + {0.3713171939518376, 0.9285060804732155}, // k=97 + {0.3598950365349883, 0.9329927988347388}, // k=98 + {0.3484186802494345, 0.9373390119125750}, // k=99 + {0.3368898533922201, 0.9415440651830208}, // k=100 + {0.3253102921622630, 0.9456073253805213}, // k=101 + {0.3136817403988916, 0.9495281805930367}, // k=102 + {0.3020059493192282, 0.9533060403541938}, // k=103 + {0.2902846772544623, 0.9569403357322089}, // k=104 + {0.2785196893850531, 0.9604305194155658}, // k=105 + {0.2667127574748984, 0.9637760657954398}, // k=106 + {0.2548656596045146, 0.9669764710448521}, // k=107 + {0.2429801799032640, 0.9700312531945440}, // k=108 + {0.2310581082806713, 0.9729399522055601}, // k=109 + {0.2191012401568698, 0.9757021300385286}, // k=110 + {0.2071113761922186, 0.9783173707196277}, // k=111 + {0.1950903220161283, 0.9807852804032304}, // k=112 + {0.1830398879551411, 0.9831054874312163}, // k=113 + {0.1709618887603014, 0.9852776423889412}, // k=114 + {0.1588581433338614, 0.9873014181578584}, // k=115 + {0.1467304744553617, 0.9891765099647810}, // k=116 + {0.1345807085071262, 0.9909026354277800}, // k=117 + {0.1224106751992163, 0.9924795345987100}, // k=118 + {0.1102222072938832, 0.9939069700023561}, // k=119 + {0.0980171403295608, 0.9951847266721968}, // k=120 + {0.0857973123444399, 0.9963126121827780}, // k=121 + {0.0735645635996675, 0.9972904566786902}, // k=122 + {0.0613207363022086, 0.9981181129001492}, // k=123 + {0.0490676743274181, 0.9987954562051724}, // k=124 + {0.0368072229413590, 0.9993223845883495}, // k=125 + {0.0245412285229123, 0.9996988186962042}, // k=126 + {0.0122715382857199, 0.9999247018391445}, // k=127 + {0.0000000000000001, 1.0000000000000000}, // k=128 + {-0.0122715382857198, 
0.9999247018391445}, // k=129 + {-0.0245412285229121, 0.9996988186962042}, // k=130 + {-0.0368072229413589, 0.9993223845883495}, // k=131 + {-0.0490676743274180, 0.9987954562051724}, // k=132 + {-0.0613207363022085, 0.9981181129001492}, // k=133 + {-0.0735645635996673, 0.9972904566786902}, // k=134 + {-0.0857973123444398, 0.9963126121827780}, // k=135 + {-0.0980171403295606, 0.9951847266721969}, // k=136 + {-0.1102222072938831, 0.9939069700023561}, // k=137 + {-0.1224106751992162, 0.9924795345987100}, // k=138 + {-0.1345807085071261, 0.9909026354277800}, // k=139 + {-0.1467304744553616, 0.9891765099647810}, // k=140 + {-0.1588581433338613, 0.9873014181578584}, // k=141 + {-0.1709618887603012, 0.9852776423889412}, // k=142 + {-0.1830398879551409, 0.9831054874312163}, // k=143 + {-0.1950903220161282, 0.9807852804032304}, // k=144 + {-0.2071113761922184, 0.9783173707196277}, // k=145 + {-0.2191012401568697, 0.9757021300385286}, // k=146 + {-0.2310581082806711, 0.9729399522055602}, // k=147 + {-0.2429801799032639, 0.9700312531945440}, // k=148 + {-0.2548656596045145, 0.9669764710448521}, // k=149 + {-0.2667127574748983, 0.9637760657954398}, // k=150 + {-0.2785196893850529, 0.9604305194155659}, // k=151 + {-0.2902846772544622, 0.9569403357322089}, // k=152 + {-0.3020059493192281, 0.9533060403541939}, // k=153 + {-0.3136817403988914, 0.9495281805930367}, // k=154 + {-0.3253102921622629, 0.9456073253805214}, // k=155 + {-0.3368898533922199, 0.9415440651830208}, // k=156 + {-0.3484186802494344, 0.9373390119125750}, // k=157 + {-0.3598950365349882, 0.9329927988347388}, // k=158 + {-0.3713171939518375, 0.9285060804732156}, // k=159 + {-0.3826834323650897, 0.9238795325112867}, // k=160 + {-0.3939920400610480, 0.9191138516900578}, // k=161 + {-0.4052413140049897, 0.9142097557035307}, // k=162 + {-0.4164295600976370, 0.9091679830905225}, // k=163 + {-0.4275550934302819, 0.9039892931234434}, // k=164 + {-0.4386162385385274, 0.8986744656939539}, // k=165 + {-0.4496113296546067, 
0.8932243011955152}, // k=166 + {-0.4605387109582401, 0.8876396204028539}, // k=167 + {-0.4713967368259977, 0.8819212643483550}, // k=168 + {-0.4821837720791227, 0.8760700941954066}, // k=169 + {-0.4928981922297840, 0.8700869911087115}, // k=170 + {-0.5035383837257175, 0.8639728561215868}, // k=171 + {-0.5141027441932217, 0.8577286100002721}, // k=172 + {-0.5245896826784687, 0.8513551931052652}, // k=173 + {-0.5349976198870970, 0.8448535652497072}, // k=174 + {-0.5453249884220462, 0.8382247055548382}, // k=175 + {-0.5555702330196020, 0.8314696123025455}, // k=176 + {-0.5657318107836132, 0.8245893027850252}, // k=177 + {-0.5758081914178453, 0.8175848131515837}, // k=178 + {-0.5857978574564389, 0.8104571982525948}, // k=179 + {-0.5956993044924334, 0.8032075314806449}, // k=180 + {-0.6055110414043254, 0.7958369046088836}, // k=181 + {-0.6152315905806267, 0.7883464276266063}, // k=182 + {-0.6248594881423862, 0.7807372285720946}, // k=183 + {-0.6343932841636454, 0.7730104533627371}, // k=184 + {-0.6438315428897913, 0.7651672656224591}, // k=185 + {-0.6531728429537765, 0.7572088465064847}, // k=186 + {-0.6624157775901719, 0.7491363945234593}, // k=187 + {-0.6715589548470184, 0.7409511253549590}, // k=188 + {-0.6806009977954530, 0.7326542716724128}, // k=189 + {-0.6895405447370669, 0.7242470829514669}, // k=190 + {-0.6983762494089728, 0.7157308252838187}, // k=191 + {-0.7071067811865475, 0.7071067811865476}, // k=192 + {-0.7157308252838186, 0.6983762494089729}, // k=193 + {-0.7242470829514668, 0.6895405447370671}, // k=194 + {-0.7326542716724127, 0.6806009977954532}, // k=195 + {-0.7409511253549589, 0.6715589548470186}, // k=196 + {-0.7491363945234591, 0.6624157775901720}, // k=197 + {-0.7572088465064846, 0.6531728429537766}, // k=198 + {-0.7651672656224590, 0.6438315428897914}, // k=199 + {-0.7730104533627370, 0.6343932841636455}, // k=200 + {-0.7807372285720945, 0.6248594881423863}, // k=201 + {-0.7883464276266062, 0.6152315905806269}, // k=202 + {-0.7958369046088835, 
0.6055110414043257}, // k=203 + {-0.8032075314806448, 0.5956993044924335}, // k=204 + {-0.8104571982525947, 0.5857978574564390}, // k=205 + {-0.8175848131515836, 0.5758081914178454}, // k=206 + {-0.8245893027850251, 0.5657318107836135}, // k=207 + {-0.8314696123025453, 0.5555702330196022}, // k=208 + {-0.8382247055548381, 0.5453249884220464}, // k=209 + {-0.8448535652497071, 0.5349976198870972}, // k=210 + {-0.8513551931052652, 0.5245896826784689}, // k=211 + {-0.8577286100002720, 0.5141027441932218}, // k=212 + {-0.8639728561215867, 0.5035383837257177}, // k=213 + {-0.8700869911087113, 0.4928981922297841}, // k=214 + {-0.8760700941954065, 0.4821837720791229}, // k=215 + {-0.8819212643483549, 0.4713967368259979}, // k=216 + {-0.8876396204028538, 0.4605387109582402}, // k=217 + {-0.8932243011955152, 0.4496113296546069}, // k=218 + {-0.8986744656939539, 0.4386162385385275}, // k=219 + {-0.9039892931234433, 0.4275550934302820}, // k=220 + {-0.9091679830905224, 0.4164295600976372}, // k=221 + {-0.9142097557035307, 0.4052413140049899}, // k=222 + {-0.9191138516900578, 0.3939920400610482}, // k=223 + {-0.9238795325112867, 0.3826834323650899}, // k=224 + {-0.9285060804732155, 0.3713171939518377}, // k=225 + {-0.9329927988347388, 0.3598950365349883}, // k=226 + {-0.9373390119125748, 0.3484186802494348}, // k=227 + {-0.9415440651830207, 0.3368898533922203}, // k=228 + {-0.9456073253805212, 0.3253102921622633}, // k=229 + {-0.9495281805930367, 0.3136817403988914}, // k=230 + {-0.9533060403541939, 0.3020059493192280}, // k=231 + {-0.9569403357322088, 0.2902846772544624}, // k=232 + {-0.9604305194155658, 0.2785196893850532}, // k=233 + {-0.9637760657954398, 0.2667127574748985}, // k=234 + {-0.9669764710448521, 0.2548656596045147}, // k=235 + {-0.9700312531945440, 0.2429801799032641}, // k=236 + {-0.9729399522055601, 0.2310581082806713}, // k=237 + {-0.9757021300385285, 0.2191012401568700}, // k=238 + {-0.9783173707196275, 0.2071113761922188}, // k=239 + {-0.9807852804032304, 
0.1950903220161286}, // k=240 + {-0.9831054874312163, 0.1830398879551409}, // k=241 + {-0.9852776423889412, 0.1709618887603012}, // k=242 + {-0.9873014181578584, 0.1588581433338615}, // k=243 + {-0.9891765099647810, 0.1467304744553618}, // k=244 + {-0.9909026354277800, 0.1345807085071263}, // k=245 + {-0.9924795345987100, 0.1224106751992163}, // k=246 + {-0.9939069700023561, 0.1102222072938832}, // k=247 + {-0.9951847266721968, 0.0980171403295608}, // k=248 + {-0.9963126121827780, 0.0857973123444402}, // k=249 + {-0.9972904566786902, 0.0735645635996677}, // k=250 + {-0.9981181129001492, 0.0613207363022085}, // k=251 + {-0.9987954562051724, 0.0490676743274180}, // k=252 + {-0.9993223845883495, 0.0368072229413588}, // k=253 + {-0.9996988186962042, 0.0245412285229123}, // k=254 + {-0.9999247018391445, 0.0122715382857200}, // k=255 + {-1.0000000000000000, 0.0000000000000001}, // k=256 + {-0.9999247018391445, -0.0122715382857198}, // k=257 + {-0.9996988186962042, -0.0245412285229121}, // k=258 + {-0.9993223845883495, -0.0368072229413586}, // k=259 + {-0.9987954562051724, -0.0490676743274177}, // k=260 + {-0.9981181129001492, -0.0613207363022082}, // k=261 + {-0.9972904566786902, -0.0735645635996675}, // k=262 + {-0.9963126121827780, -0.0857973123444399}, // k=263 + {-0.9951847266721969, -0.0980171403295606}, // k=264 + {-0.9939069700023561, -0.1102222072938830}, // k=265 + {-0.9924795345987100, -0.1224106751992161}, // k=266 + {-0.9909026354277800, -0.1345807085071261}, // k=267 + {-0.9891765099647810, -0.1467304744553616}, // k=268 + {-0.9873014181578584, -0.1588581433338612}, // k=269 + {-0.9852776423889413, -0.1709618887603010}, // k=270 + {-0.9831054874312164, -0.1830398879551406}, // k=271 + {-0.9807852804032304, -0.1950903220161284}, // k=272 + {-0.9783173707196277, -0.2071113761922186}, // k=273 + {-0.9757021300385286, -0.2191012401568698}, // k=274 + {-0.9729399522055602, -0.2310581082806711}, // k=275 + {-0.9700312531945440, -0.2429801799032638}, // k=276 + 
{-0.9669764710448522, -0.2548656596045145}, // k=277 + {-0.9637760657954400, -0.2667127574748983}, // k=278 + {-0.9604305194155659, -0.2785196893850529}, // k=279 + {-0.9569403357322089, -0.2902846772544621}, // k=280 + {-0.9533060403541940, -0.3020059493192278}, // k=281 + {-0.9495281805930368, -0.3136817403988912}, // k=282 + {-0.9456073253805213, -0.3253102921622630}, // k=283 + {-0.9415440651830208, -0.3368898533922201}, // k=284 + {-0.9373390119125750, -0.3484186802494346}, // k=285 + {-0.9329927988347390, -0.3598950365349881}, // k=286 + {-0.9285060804732156, -0.3713171939518374}, // k=287 + {-0.9238795325112868, -0.3826834323650897}, // k=288 + {-0.9191138516900578, -0.3939920400610479}, // k=289 + {-0.9142097557035307, -0.4052413140049897}, // k=290 + {-0.9091679830905225, -0.4164295600976369}, // k=291 + {-0.9039892931234434, -0.4275550934302818}, // k=292 + {-0.8986744656939540, -0.4386162385385273}, // k=293 + {-0.8932243011955153, -0.4496113296546067}, // k=294 + {-0.8876396204028539, -0.4605387109582401}, // k=295 + {-0.8819212643483550, -0.4713967368259976}, // k=296 + {-0.8760700941954066, -0.4821837720791227}, // k=297 + {-0.8700869911087115, -0.4928981922297839}, // k=298 + {-0.8639728561215868, -0.5035383837257175}, // k=299 + {-0.8577286100002721, -0.5141027441932216}, // k=300 + {-0.8513551931052653, -0.5245896826784687}, // k=301 + {-0.8448535652497072, -0.5349976198870969}, // k=302 + {-0.8382247055548382, -0.5453249884220461}, // k=303 + {-0.8314696123025455, -0.5555702330196020}, // k=304 + {-0.8245893027850253, -0.5657318107836132}, // k=305 + {-0.8175848131515837, -0.5758081914178453}, // k=306 + {-0.8104571982525948, -0.5857978574564389}, // k=307 + {-0.8032075314806449, -0.5956993044924332}, // k=308 + {-0.7958369046088836, -0.6055110414043254}, // k=309 + {-0.7883464276266063, -0.6152315905806267}, // k=310 + {-0.7807372285720946, -0.6248594881423862}, // k=311 + {-0.7730104533627371, -0.6343932841636453}, // k=312 + 
{-0.7651672656224591, -0.6438315428897913}, // k=313 + {-0.7572088465064848, -0.6531728429537765}, // k=314 + {-0.7491363945234593, -0.6624157775901718}, // k=315 + {-0.7409511253549591, -0.6715589548470184}, // k=316 + {-0.7326542716724128, -0.6806009977954530}, // k=317 + {-0.7242470829514670, -0.6895405447370668}, // k=318 + {-0.7157308252838187, -0.6983762494089728}, // k=319 + {-0.7071067811865477, -0.7071067811865475}, // k=320 + {-0.6983762494089730, -0.7157308252838185}, // k=321 + {-0.6895405447370671, -0.7242470829514668}, // k=322 + {-0.6806009977954532, -0.7326542716724126}, // k=323 + {-0.6715589548470187, -0.7409511253549589}, // k=324 + {-0.6624157775901720, -0.7491363945234590}, // k=325 + {-0.6531728429537771, -0.7572088465064842}, // k=326 + {-0.6438315428897915, -0.7651672656224590}, // k=327 + {-0.6343932841636459, -0.7730104533627367}, // k=328 + {-0.6248594881423865, -0.7807372285720944}, // k=329 + {-0.6152315905806273, -0.7883464276266059}, // k=330 + {-0.6055110414043257, -0.7958369046088835}, // k=331 + {-0.5956993044924331, -0.8032075314806451}, // k=332 + {-0.5857978574564391, -0.8104571982525947}, // k=333 + {-0.5758081914178452, -0.8175848131515838}, // k=334 + {-0.5657318107836135, -0.8245893027850251}, // k=335 + {-0.5555702330196022, -0.8314696123025452}, // k=336 + {-0.5453249884220468, -0.8382247055548379}, // k=337 + {-0.5349976198870973, -0.8448535652497070}, // k=338 + {-0.5245896826784694, -0.8513551931052649}, // k=339 + {-0.5141027441932218, -0.8577286100002720}, // k=340 + {-0.5035383837257180, -0.8639728561215865}, // k=341 + {-0.4928981922297842, -0.8700869911087113}, // k=342 + {-0.4821837720791226, -0.8760700941954067}, // k=343 + {-0.4713967368259979, -0.8819212643483549}, // k=344 + {-0.4605387109582399, -0.8876396204028540}, // k=345 + {-0.4496113296546069, -0.8932243011955152}, // k=346 + {-0.4386162385385276, -0.8986744656939538}, // k=347 + {-0.4275550934302825, -0.9039892931234431}, // k=348 + 
{-0.4164295600976372, -0.9091679830905224}, // k=349 + {-0.4052413140049904, -0.9142097557035305}, // k=350 + {-0.3939920400610482, -0.9191138516900577}, // k=351 + {-0.3826834323650903, -0.9238795325112865}, // k=352 + {-0.3713171939518378, -0.9285060804732155}, // k=353 + {-0.3598950365349879, -0.9329927988347390}, // k=354 + {-0.3484186802494348, -0.9373390119125748}, // k=355 + {-0.3368898533922199, -0.9415440651830208}, // k=356 + {-0.3253102921622633, -0.9456073253805212}, // k=357 + {-0.3136817403988915, -0.9495281805930367}, // k=358 + {-0.3020059493192285, -0.9533060403541938}, // k=359 + {-0.2902846772544624, -0.9569403357322088}, // k=360 + {-0.2785196893850536, -0.9604305194155657}, // k=361 + {-0.2667127574748985, -0.9637760657954398}, // k=362 + {-0.2548656596045143, -0.9669764710448522}, // k=363 + {-0.2429801799032641, -0.9700312531945440}, // k=364 + {-0.2310581082806709, -0.9729399522055602}, // k=365 + {-0.2191012401568701, -0.9757021300385285}, // k=366 + {-0.2071113761922185, -0.9783173707196277}, // k=367 + {-0.1950903220161287, -0.9807852804032303}, // k=368 + {-0.1830398879551410, -0.9831054874312163}, // k=369 + {-0.1709618887603017, -0.9852776423889411}, // k=370 + {-0.1588581433338615, -0.9873014181578583}, // k=371 + {-0.1467304744553623, -0.9891765099647809}, // k=372 + {-0.1345807085071264, -0.9909026354277800}, // k=373 + {-0.1224106751992160, -0.9924795345987101}, // k=374 + {-0.1102222072938833, -0.9939069700023561}, // k=375 + {-0.0980171403295605, -0.9951847266721969}, // k=376 + {-0.0857973123444402, -0.9963126121827780}, // k=377 + {-0.0735645635996674, -0.9972904566786902}, // k=378 + {-0.0613207363022090, -0.9981181129001492}, // k=379 + {-0.0490676743274180, -0.9987954562051724}, // k=380 + {-0.0368072229413593, -0.9993223845883494}, // k=381 + {-0.0245412285229124, -0.9996988186962042}, // k=382 + {-0.0122715382857205, -0.9999247018391445}, // k=383 + {-0.0000000000000002, -1.0000000000000000}, // k=384 + 
{0.0122715382857201, -0.9999247018391445}, // k=385 + {0.0245412285229120, -0.9996988186962042}, // k=386 + {0.0368072229413590, -0.9993223845883495}, // k=387 + {0.0490676743274177, -0.9987954562051724}, // k=388 + {0.0613207363022086, -0.9981181129001492}, // k=389 + {0.0735645635996670, -0.9972904566786902}, // k=390 + {0.0857973123444399, -0.9963126121827780}, // k=391 + {0.0980171403295601, -0.9951847266721969}, // k=392 + {0.1102222072938829, -0.9939069700023561}, // k=393 + {0.1224106751992156, -0.9924795345987101}, // k=394 + {0.1345807085071260, -0.9909026354277800}, // k=395 + {0.1467304744553619, -0.9891765099647809}, // k=396 + {0.1588581433338612, -0.9873014181578584}, // k=397 + {0.1709618887603013, -0.9852776423889412}, // k=398 + {0.1830398879551406, -0.9831054874312164}, // k=399 + {0.1950903220161283, -0.9807852804032304}, // k=400 + {0.2071113761922181, -0.9783173707196278}, // k=401 + {0.2191012401568697, -0.9757021300385286}, // k=402 + {0.2310581082806706, -0.9729399522055603}, // k=403 + {0.2429801799032638, -0.9700312531945440}, // k=404 + {0.2548656596045140, -0.9669764710448523}, // k=405 + {0.2667127574748982, -0.9637760657954400}, // k=406 + {0.2785196893850533, -0.9604305194155658}, // k=407 + {0.2902846772544621, -0.9569403357322089}, // k=408 + {0.3020059493192281, -0.9533060403541939}, // k=409 + {0.3136817403988911, -0.9495281805930368}, // k=410 + {0.3253102921622629, -0.9456073253805213}, // k=411 + {0.3368898533922196, -0.9415440651830209}, // k=412 + {0.3484186802494345, -0.9373390119125750}, // k=413 + {0.3598950365349876, -0.9329927988347391}, // k=414 + {0.3713171939518374, -0.9285060804732156}, // k=415 + {0.3826834323650900, -0.9238795325112866}, // k=416 + {0.3939920400610479, -0.9191138516900579}, // k=417 + {0.4052413140049900, -0.9142097557035306}, // k=418 + {0.4164295600976369, -0.9091679830905225}, // k=419 + {0.4275550934302821, -0.9039892931234433}, // k=420 + {0.4386162385385273, -0.8986744656939540}, // k=421 + 
{0.4496113296546066, -0.8932243011955153}, // k=422 + {0.4605387109582396, -0.8876396204028542}, // k=423 + {0.4713967368259976, -0.8819212643483550}, // k=424 + {0.4821837720791222, -0.8760700941954069}, // k=425 + {0.4928981922297839, -0.8700869911087115}, // k=426 + {0.5035383837257178, -0.8639728561215866}, // k=427 + {0.5141027441932216, -0.8577286100002722}, // k=428 + {0.5245896826784691, -0.8513551931052651}, // k=429 + {0.5349976198870969, -0.8448535652497072}, // k=430 + {0.5453249884220465, -0.8382247055548380}, // k=431 + {0.5555702330196018, -0.8314696123025455}, // k=432 + {0.5657318107836131, -0.8245893027850253}, // k=433 + {0.5758081914178449, -0.8175848131515840}, // k=434 + {0.5857978574564388, -0.8104571982525949}, // k=435 + {0.5956993044924329, -0.8032075314806453}, // k=436 + {0.6055110414043253, -0.7958369046088837}, // k=437 + {0.6152315905806270, -0.7883464276266061}, // k=438 + {0.6248594881423861, -0.7807372285720946}, // k=439 + {0.6343932841636456, -0.7730104533627369}, // k=440 + {0.6438315428897912, -0.7651672656224592}, // k=441 + {0.6531728429537768, -0.7572088465064846}, // k=442 + {0.6624157775901715, -0.7491363945234596}, // k=443 + {0.6715589548470183, -0.7409511253549591}, // k=444 + {0.6806009977954527, -0.7326542716724131}, // k=445 + {0.6895405447370668, -0.7242470829514670}, // k=446 + {0.6983762494089724, -0.7157308252838190}, // k=447 + {0.7071067811865474, -0.7071067811865477}, // k=448 + {0.7157308252838188, -0.6983762494089727}, // k=449 + {0.7242470829514667, -0.6895405447370672}, // k=450 + {0.7326542716724129, -0.6806009977954530}, // k=451 + {0.7409511253549589, -0.6715589548470187}, // k=452 + {0.7491363945234594, -0.6624157775901718}, // k=453 + {0.7572088465064842, -0.6531728429537771}, // k=454 + {0.7651672656224588, -0.6438315428897915}, // k=455 + {0.7730104533627367, -0.6343932841636459}, // k=456 + {0.7807372285720944, -0.6248594881423865}, // k=457 + {0.7883464276266059, -0.6152315905806274}, // k=458 + 
{0.7958369046088833, -0.6055110414043257}, // k=459 + {0.8032075314806451, -0.5956993044924332}, // k=460 + {0.8104571982525947, -0.5857978574564391}, // k=461 + {0.8175848131515837, -0.5758081914178452}, // k=462 + {0.8245893027850251, -0.5657318107836136}, // k=463 + {0.8314696123025452, -0.5555702330196022}, // k=464 + {0.8382247055548377, -0.5453249884220468}, // k=465 + {0.8448535652497070, -0.5349976198870973}, // k=466 + {0.8513551931052649, -0.5245896826784694}, // k=467 + {0.8577286100002720, -0.5141027441932219}, // k=468 + {0.8639728561215864, -0.5035383837257181}, // k=469 + {0.8700869911087113, -0.4928981922297843}, // k=470 + {0.8760700941954067, -0.4821837720791226}, // k=471 + {0.8819212643483548, -0.4713967368259979}, // k=472 + {0.8876396204028539, -0.4605387109582399}, // k=473 + {0.8932243011955151, -0.4496113296546070}, // k=474 + {0.8986744656939538, -0.4386162385385277}, // k=475 + {0.9039892931234431, -0.4275550934302825}, // k=476 + {0.9091679830905224, -0.4164295600976373}, // k=477 + {0.9142097557035305, -0.4052413140049904}, // k=478 + {0.9191138516900577, -0.3939920400610483}, // k=479 + {0.9238795325112865, -0.3826834323650904}, // k=480 + {0.9285060804732155, -0.3713171939518378}, // k=481 + {0.9329927988347390, -0.3598950365349880}, // k=482 + {0.9373390119125748, -0.3484186802494349}, // k=483 + {0.9415440651830208, -0.3368898533922200}, // k=484 + {0.9456073253805212, -0.3253102921622634}, // k=485 + {0.9495281805930367, -0.3136817403988915}, // k=486 + {0.9533060403541936, -0.3020059493192286}, // k=487 + {0.9569403357322088, -0.2902846772544625}, // k=488 + {0.9604305194155657, -0.2785196893850537}, // k=489 + {0.9637760657954398, -0.2667127574748986}, // k=490 + {0.9669764710448522, -0.2548656596045144}, // k=491 + {0.9700312531945440, -0.2429801799032642}, // k=492 + {0.9729399522055602, -0.2310581082806710}, // k=493 + {0.9757021300385285, -0.2191012401568702}, // k=494 + {0.9783173707196277, -0.2071113761922185}, // k=495 + 
{0.9807852804032303, -0.1950903220161287}, // k=496 + {0.9831054874312163, -0.1830398879551410}, // k=497 + {0.9852776423889411, -0.1709618887603018}, // k=498 + {0.9873014181578583, -0.1588581433338616}, // k=499 + {0.9891765099647809, -0.1467304744553624}, // k=500 + {0.9909026354277800, -0.1345807085071264}, // k=501 + {0.9924795345987100, -0.1224106751992160}, // k=502 + {0.9939069700023561, -0.1102222072938834}, // k=503 + {0.9951847266721969, -0.0980171403295605}, // k=504 + {0.9963126121827780, -0.0857973123444403}, // k=505 + {0.9972904566786902, -0.0735645635996674}, // k=506 + {0.9981181129001492, -0.0613207363022091}, // k=507 + {0.9987954562051724, -0.0490676743274181}, // k=508 + {0.9993223845883494, -0.0368072229413594}, // k=509 + {0.9996988186962042, -0.0245412285229124}, // k=510 + {0.9999247018391445, -0.0122715382857206}, // k=511 +}; diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 382052047d..759c5aaa0a 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -17,6 +17,11 @@ #include +#if BOUT_HAS_CUDA +#include +#include +#endif + ShiftedMetric::ShiftedMetric(Mesh& m, CELL_LOC location_in, Field2D zShift_, BoutReal zlength_in, Options* opt) : ParallelTransform(m, opt), location(location_in), zShift(std::move(zShift_)), @@ -38,8 +43,8 @@ void ShiftedMetric::checkInputGrid() { "Should be 'shiftedmetric'."); } } // else: parallel_transform variable not found in grid input, indicates older input - // file or grid from options so must rely on the user having ensured the type is - // correct + // file or grid from options so must rely on the user having ensured the type is + // correct } void ShiftedMetric::outputVars(Options& output_options) { @@ -67,6 +72,7 @@ void ShiftedMetric::cachePhases() { toAlignedPhs = Tensor(mesh.LocalNx, mesh.LocalNy, nmodes); // To/From field aligned phases + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; 
BOUT_FOR(i, mesh.getRegion2D("RGN_ALL")) { int ix = i.x(); int iy = i.y(); @@ -105,6 +111,7 @@ void ShiftedMetric::cachePhases() { // Parallel slice phases -- note we don't shift in the boundaries/guards for (auto& slice : parallel_slice_phases) { + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { int ix = i.x(); @@ -166,6 +173,7 @@ Field3D ShiftedMetric::shiftZ(const Field3D& f, const Tensor& phs, Field3D result{emptyFrom(f).setDirectionY(y_direction_out)}; + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D(toString(region))) { shiftZ(&f(i, 0), &phs(i.x(), i.y(), 0), &result(i, 0)); } @@ -196,7 +204,8 @@ FieldPerp ShiftedMetric::shiftZ(const FieldPerp& f, const Tensor& phs, return result; } -void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* out) const { +void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* out, + int num_batches) const { #if BOUT_HAS_UMPIRE // TODO: This static keyword is a hotfix and should be removed in // future iterations. It is here because otherwise many allocations @@ -208,7 +217,7 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou #endif // Take forward FFT - rfft(in, mesh.LocalNz, &cmplx[0]); + rfft(in, mesh.LocalNz * num_batches, &cmplx[0]); // Following is an algorithm approach to write a = a*b where a and b are // vectors of dcomplex. 
@@ -222,6 +231,267 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou irfft(&cmplx[0], mesh.LocalNz, out); // Reverse FFT } +/* NEW CODE */ +// Bit-reversal +__device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { + unsigned int result = 0; +#pragma unroll + for (unsigned int i = 0; i < log2n; i++) { + result = (result << 1) | (x & 1); + x >>= 1; + } + return result; +} + +// Block-level cooperative FFT +// Multiple threads cooperate on each FFT using shared memory +template +__global__ void +fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ out, + const double2** __restrict__ blocks_phs, const int Nz_runtime, + const int nmodes, const int batches, const int nblocks) { + + constexpr int LOG2_NZ = __builtin_ctz(NZ); + constexpr double INV_NZ = 1.0 / (double)NZ; + + // Shared memory for FFTS_PER_BLOCK FFTs + // Each FFT needs NZ complex values + __shared__ double2 shared_fft[FFTS_PER_BLOCK][NZ]; + + // Select twiddles based on size + const double2* twiddles; + if constexpr (NZ == 16) { + twiddles = c_twiddle_fwd_16; + } else if constexpr (NZ == 64) { + twiddles = c_twiddle_fwd_64; + } else if constexpr (NZ == 128) { + twiddles = c_twiddle_fwd_128; + } else if constexpr (NZ == 256) { + twiddles = c_twiddle_fwd_256; + } else if constexpr (NZ == 512) { + twiddles = c_twiddle_fwd_512; + } else { + static_assert(NZ == 16 || NZ == 64 || NZ == 128 || NZ == 256 || NZ == 512, + "Unsupported NZ"); + } + + // Each block processes FFTS_PER_BLOCK FFTs + const int fft_id_in_block = + threadIdx.y; // Which FFT this thread works on (0 to FFTS_PER_BLOCK-1) + const int global_fft_id = blockIdx.x * FFTS_PER_BLOCK + fft_id_in_block; + + if (global_fft_id >= nblocks * batches) + return; + + const int block = global_fft_id / batches; + const int batch = global_fft_id % batches; + + const double* __restrict__ in_line = in[block] + batch * NZ; + double* __restrict__ out_line = out[block] + batch * NZ; + 
const double2* __restrict__ phs = blocks_phs[block]; + + // Thread ID within the FFT computation + const int tid = threadIdx.x; + const int threads_per_fft = blockDim.x; // All threads in x-dimension work on same FFT + + // ===== LOAD INPUT WITH BIT-REVERSAL ===== + // Each thread loads some elements (strided) + for (int i = tid; i < NZ; i += threads_per_fft) { + const unsigned int rev_i = bit_reverse(i, LOG2_NZ); + shared_fft[fft_id_in_block][rev_i].x = in_line[i]; + shared_fft[fft_id_in_block][rev_i].y = 0.0; + } + __syncthreads(); + + // ===== FORWARD FFT: Cooley-Tukey DIT in Shared Memory ===== + for (int stage = 0; stage < LOG2_NZ; ++stage) { + const int m = 1 << (stage + 1); + const int m_half = m >> 1; + + // Each thread processes multiple butterflies + for (int k = tid; k < NZ / 2; k += threads_per_fft) { + const int butterfly_group = k / m_half; + const int j = k % m_half; + const int idx_top = butterfly_group * m + j; + const int idx_bot = idx_top + m_half; + + // Twiddle factor + const int twiddle_k = (j * NZ) / m; + const double wr = twiddles[twiddle_k].x; + const double wi = twiddles[twiddle_k].y; + + // Load from shared memory + const double top_r = shared_fft[fft_id_in_block][idx_top].x; + const double top_i = shared_fft[fft_id_in_block][idx_top].y; + const double bot_r = shared_fft[fft_id_in_block][idx_bot].x; + const double bot_i = shared_fft[fft_id_in_block][idx_bot].y; + + // Butterfly: t = W * bottom + const double t_r = wr * bot_r - wi * bot_i; + const double t_i = wr * bot_i + wi * bot_r; + + // Write back + shared_fft[fft_id_in_block][idx_top].x = top_r + t_r; + shared_fft[fft_id_in_block][idx_top].y = top_i + t_i; + shared_fft[fft_id_in_block][idx_bot].x = top_r - t_r; + shared_fft[fft_id_in_block][idx_bot].y = top_i - t_i; + } + __syncthreads(); + } + + // ===== APPLY PHASE SHIFT ===== + for (int k = tid; k < nmodes; k += threads_per_fft) { + const double2 ph = phs[batch * nmodes + k]; + const double real = shared_fft[fft_id_in_block][k].x; 
+ const double imag = shared_fft[fft_id_in_block][k].y; + shared_fft[fft_id_in_block][k].x = real * ph.x - imag * ph.y; + shared_fft[fft_id_in_block][k].y = real * ph.y + imag * ph.x; + } + + for (int k = tid + nmodes; k < NZ; k += threads_per_fft) { + if (k >= nmodes) { + const int kk = NZ - k; + const double2 tmp = phs[batch * nmodes + kk]; + const double real = shared_fft[fft_id_in_block][k].x; + const double imag = shared_fft[fft_id_in_block][k].y; + shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; + shared_fft[fft_id_in_block][k].y = -real * tmp.y + imag * tmp.x; + } + } + __syncthreads(); + + // ===== INVERSE FFT: Conjugate, FFT, Conjugate ===== + // Conjugate input + for (int i = tid; i < NZ; i += threads_per_fft) { + shared_fft[fft_id_in_block][i].y = -shared_fft[fft_id_in_block][i].y; + } + __syncthreads(); + + // Bit-reverse for inverse + __shared__ double2 temp_fft[FFTS_PER_BLOCK][NZ]; + for (int i = tid; i < NZ; i += threads_per_fft) { + const unsigned int rev_i = bit_reverse(i, LOG2_NZ); + temp_fft[fft_id_in_block][rev_i] = shared_fft[fft_id_in_block][i]; + } + __syncthreads(); + + for (int i = tid; i < NZ; i += threads_per_fft) { + shared_fft[fft_id_in_block][i] = temp_fft[fft_id_in_block][i]; + } + __syncthreads(); + + // Forward FFT again (for inverse) + for (int stage = 0; stage < LOG2_NZ; ++stage) { + const int m = 1 << (stage + 1); + const int m_half = m >> 1; + + for (int k = tid; k < NZ / 2; k += threads_per_fft) { + const int butterfly_group = k / m_half; + const int j = k % m_half; + const int idx_top = butterfly_group * m + j; + const int idx_bot = idx_top + m_half; + + const int twiddle_k = (j * NZ) / m; + const double wr = twiddles[twiddle_k].x; + const double wi = twiddles[twiddle_k].y; + + const double top_r = shared_fft[fft_id_in_block][idx_top].x; + const double top_i = shared_fft[fft_id_in_block][idx_top].y; + const double bot_r = shared_fft[fft_id_in_block][idx_bot].x; + const double bot_i = 
shared_fft[fft_id_in_block][idx_bot].y; + + const double t_r = wr * bot_r - wi * bot_i; + const double t_i = wr * bot_i + wi * bot_r; + + shared_fft[fft_id_in_block][idx_top].x = top_r + t_r; + shared_fft[fft_id_in_block][idx_top].y = top_i + t_i; + shared_fft[fft_id_in_block][idx_bot].x = top_r - t_r; + shared_fft[fft_id_in_block][idx_bot].y = top_i - t_i; + } + __syncthreads(); + } + + // Store output (conjugate and normalize) + for (int i = tid; i < NZ; i += threads_per_fft) { + out_line[i] = shared_fft[fft_id_in_block][i].x * INV_NZ; + } +} + +// Launcher for block-level cooperative FFT +static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, + const double2** phs, int nblocks, int batches, + cudaStream_t stream = 0) { + int Nz = mesh.LocalNz; + int nmodes = Nz / 2 + 1; + + if ((Nz & (Nz - 1)) != 0) { + fprintf(stderr, "Error: Nz=%d must be power of 2\n", Nz); + return; + } + + const int total_ffts = nblocks * batches; + + if (Nz == 16) { + constexpr int FFTS_PER_BLOCK = 16; + constexpr int THREADS_PER_FFT = 16; // Use 64 threads per FFT + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 16 x 16 = 256 threads + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<16, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + } else if (Nz == 64) { + constexpr int FFTS_PER_BLOCK = 4; + constexpr int THREADS_PER_FFT = 64; // Use 64 threads per FFT + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 64 x 4 = 256 threads + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<64, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + + } else if (Nz == 128) { + constexpr int FFTS_PER_BLOCK = 2; + constexpr int THREADS_PER_FFT = 128; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 128 x 2 = 256 threads + dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); + + fft_block_cooperative<128, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, 
batches, nblocks); + + } else if (Nz == 256) { + constexpr int FFTS_PER_BLOCK = 1; + constexpr int THREADS_PER_FFT = 256; + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 256 x 1 = 256 threads + dim3 grid(total_ffts); + + fft_block_cooperative<256, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + + } else if (Nz == 512) { + constexpr int FFTS_PER_BLOCK = 1; + constexpr int THREADS_PER_FFT = 512; // 512 threads per FFT + + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 512 x 1 = 512 threads + dim3 grid(total_ffts); + + fft_block_cooperative<512, FFTS_PER_BLOCK> + <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + } else { + fprintf(stderr, "Unsupported Nz=%d for block FFT\n", Nz); + throw std::runtime_error("Unsupported Nz for block FFT"); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error(std::string("Block FFT failed: ") + cudaGetErrorString(err)); + } +} + +/* END NEWER CODE */ + void ShiftedMetric::calcParallelSlices(Field3D& f) { if (f.getDirectionY() == YDirectionType::Aligned) { // Cannot calculate parallel slices for field-aligned fields, so return without @@ -231,9 +501,51 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { f.splitParallelSlices(); + auto& region = mesh.getRegion2D("RGN_NOY"); + + static size_t nblocks = region.getBlocks().size(); + if (nblocks != region.getBlocks().size()) { + throw BoutException("Number of blocks changed in ShiftedMetric::calcParallelSlices"); + } + static Array blocks_in(nblocks); + static Array blocks_out(nblocks); + static Array phs_in(nblocks); + for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset); f_slice.allocate(); + +#if BOUT_HAS_CUDA + size_t block_idx = 0; + int num_batches = + region.getBlocks().cbegin()->second.ind - region.getBlocks().cbegin()->first.ind; + + for (auto block = region.getBlocks().cbegin(), end = region.getBlocks().cend(); + block < end; ++block) { + auto idx_s = block->first; + 
auto idx_e = block->second; + int inner_batches = idx_e.ind - idx_s.ind; + if (inner_batches != num_batches) { + throw BoutException( + "Non-uniform number of batches in ShiftedMetric::calcParallelSlices"); + } + const int ix = idx_s.x(); + const int iy = idx_s.y(); + const int iy_offset = iy + phase.y_offset; + + blocks_in[block_idx] = &f(ix, iy_offset, 0); + blocks_out[block_idx] = &f_slice(ix, iy_offset, 0); + phs_in[block_idx] = reinterpret_cast(&phase.phase_shift(ix, iy, 0)); + + block_idx++; + } + + shiftZ_block_fft(mesh, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, + num_batches, 0); + + cudaDeviceSynchronize(); +#else + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { const int ix = i.x(); const int iy = i.y(); @@ -241,6 +553,9 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { shiftZ(&(f(ix, iy_offset, 0)), &(phase.phase_shift(ix, iy, 0)), &(f_slice(ix, iy_offset, 0))); } + //std::cout << "ShiftedMetric::shiftZ " << __FILE__ << " :" << __LINE__ + // << " count = " << count << " each size " << mesh.LocalNz << "\n"; +#endif } } @@ -257,6 +572,7 @@ ShiftedMetric::shiftZ(const Field3D& f, Matrix> f_fft(mesh.LocalNx, mesh.LocalNy); f_fft = Array(nmodes); + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_ALL")) { int ix = i.x(); int iy = i.y(); @@ -271,6 +587,7 @@ ShiftedMetric::shiftZ(const Field3D& f, current_result.allocate(); current_result.setLocation(f.getLocation()); + // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { // Deep copy the FFT'd field int ix = i.x(); From 40974a9d98859690646e43f9a8e4391a5d1e585e Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 08:33:29 -0700 Subject: [PATCH 21/29] Fixup: add twiddle header in cmake --- CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/CMakeLists.txt b/CMakeLists.txt index f0a657fe94..b5beb75898 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,6 +177,7 @@ set(BOUT_SOURCES ./include/bout/sys/range.hxx ./include/bout/sys/timer.hxx ./include/bout/sys/type_name.hxx + ./include/bout/twiddle.hxx ./include/bout/sys/uncopyable.hxx ./include/bout/sys/uuid.h ./include/bout/sys/variant.hxx @@ -239,7 +240,7 @@ set(BOUT_SOURCES ./include/bout/invert/laplacexy2.hxx ./src/invert/laplacexy2/laplacexy2.cxx ./include/bout/invert/laplacexy2_hypre.hxx - ./src/invert/laplacexy2/laplacexy2_hypre.cxx + ./src/invert/laplacexy2/laplacexy2_hypre.cxx ./src/invert/laplacexz/impls/cyclic/laplacexz-cyclic.cxx ./src/invert/laplacexz/impls/cyclic/laplacexz-cyclic.hxx ./src/invert/laplacexz/impls/petsc/laplacexz-petsc.cxx @@ -525,7 +526,7 @@ if (BOUT_ENABLE_WARNINGS) $<$,$,$>: -Wall -Wextra > > $<$: - /W4 > + /W4 > $<$:-Xcompiler=-Wall -Xcompiler=-Wextra > ) From ba5eabf34242f8ade482cd7b8486248a44e1f73d Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 08:33:55 -0700 Subject: [PATCH 22/29] Default to pinned memory for performance --- include/bout/array.hxx | 1 + 1 file changed, 1 insertion(+) diff --git a/include/bout/array.hxx b/include/bout/array.hxx index 2c42f15aad..4965aee880 100644 --- a/include/bout/array.hxx +++ b/include/bout/array.hxx @@ -67,6 +67,7 @@ struct ArrayData { auto& rm = umpire::ResourceManager::getInstance(); #if BOUT_HAS_CUDA auto allocator = rm.getAllocator(umpire::resource::Pinned); + //auto allocator = rm.getAllocator(umpire::resource::Unified); #else auto allocator = rm.getAllocator("HOST"); #endif From c079bb6f508efa85b8b73dcf0f5ee9ac5ce3b231 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 09:40:56 -0700 Subject: [PATCH 23/29] Fixup: remove unused twiddles --- include/bout/twiddle.hxx | 1033 +------------------------------------- 1 file changed, 6 insertions(+), 1027 deletions(-) diff --git a/include/bout/twiddle.hxx 
b/include/bout/twiddle.hxx index ae4f729b48..6da72dd8ff 100644 --- a/include/bout/twiddle.hxx +++ b/include/bout/twiddle.hxx @@ -1,4 +1,4 @@ -__constant__ double2 c_twiddle_fwd_16[16] = { +__constant__ double2 c_twiddle_16[16] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9238795325112867, -0.3826834323650898}, // k=1 {0.7071067811865476, -0.7071067811865475}, // k=2 @@ -17,25 +17,7 @@ __constant__ double2 c_twiddle_fwd_16[16] = { {0.9238795325112865, 0.3826834323650904}, // k=15 }; -__constant__ double2 c_twiddle_inv_16[16] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9238795325112867, 0.3826834323650898}, // k=1 - {0.7071067811865476, 0.7071067811865475}, // k=2 - {0.3826834323650898, 0.9238795325112867}, // k=3 - {0.0000000000000001, 1.0000000000000000}, // k=4 - {-0.3826834323650897, 0.9238795325112867}, // k=5 - {-0.7071067811865475, 0.7071067811865476}, // k=6 - {-0.9238795325112867, 0.3826834323650899}, // k=7 - {-1.0000000000000000, 0.0000000000000001}, // k=8 - {-0.9238795325112868, -0.3826834323650897}, // k=9 - {-0.7071067811865477, -0.7071067811865475}, // k=10 - {-0.3826834323650903, -0.9238795325112865}, // k=11 - {-0.0000000000000002, -1.0000000000000000}, // k=12 - {0.3826834323650900, -0.9238795325112866}, // k=13 - {0.7071067811865474, -0.7071067811865477}, // k=14 - {0.9238795325112865, -0.3826834323650904}, // k=15 -}; -__constant__ double2 c_twiddle_fwd_32[32] = { +__constant__ double2 c_twiddle_32[32] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9807852804032304, -0.1950903220161282}, // k=1 {0.9238795325112867, -0.3826834323650898}, // k=2 @@ -70,41 +52,7 @@ __constant__ double2 c_twiddle_fwd_32[32] = { {0.9807852804032303, 0.1950903220161287}, // k=31 }; -__constant__ double2 c_twiddle_inv_32[32] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9807852804032304, 0.1950903220161282}, // k=1 - {0.9238795325112867, 0.3826834323650898}, // k=2 - {0.8314696123025452, 0.5555702330196022}, // k=3 - 
{0.7071067811865476, 0.7071067811865475}, // k=4 - {0.5555702330196023, 0.8314696123025452}, // k=5 - {0.3826834323650898, 0.9238795325112867}, // k=6 - {0.1950903220161283, 0.9807852804032304}, // k=7 - {0.0000000000000001, 1.0000000000000000}, // k=8 - {-0.1950903220161282, 0.9807852804032304}, // k=9 - {-0.3826834323650897, 0.9238795325112867}, // k=10 - {-0.5555702330196020, 0.8314696123025455}, // k=11 - {-0.7071067811865475, 0.7071067811865476}, // k=12 - {-0.8314696123025453, 0.5555702330196022}, // k=13 - {-0.9238795325112867, 0.3826834323650899}, // k=14 - {-0.9807852804032304, 0.1950903220161286}, // k=15 - {-1.0000000000000000, 0.0000000000000001}, // k=16 - {-0.9807852804032304, -0.1950903220161284}, // k=17 - {-0.9238795325112868, -0.3826834323650897}, // k=18 - {-0.8314696123025455, -0.5555702330196020}, // k=19 - {-0.7071067811865477, -0.7071067811865475}, // k=20 - {-0.5555702330196022, -0.8314696123025452}, // k=21 - {-0.3826834323650903, -0.9238795325112865}, // k=22 - {-0.1950903220161287, -0.9807852804032303}, // k=23 - {-0.0000000000000002, -1.0000000000000000}, // k=24 - {0.1950903220161283, -0.9807852804032304}, // k=25 - {0.3826834323650900, -0.9238795325112866}, // k=26 - {0.5555702330196018, -0.8314696123025455}, // k=27 - {0.7071067811865474, -0.7071067811865477}, // k=28 - {0.8314696123025452, -0.5555702330196022}, // k=29 - {0.9238795325112865, -0.3826834323650904}, // k=30 - {0.9807852804032303, -0.1950903220161287}, // k=31 -}; -__constant__ double2 c_twiddle_fwd_64[64] = { +__constant__ double2 c_twiddle_64[64] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9951847266721969, -0.0980171403295606}, // k=1 {0.9807852804032304, -0.1950903220161282}, // k=2 @@ -171,73 +119,7 @@ __constant__ double2 c_twiddle_fwd_64[64] = { {0.9951847266721969, 0.0980171403295605}, // k=63 }; -__constant__ double2 c_twiddle_inv_64[64] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9951847266721969, 0.0980171403295606}, // k=1 - 
{0.9807852804032304, 0.1950903220161282}, // k=2 - {0.9569403357322088, 0.2902846772544623}, // k=3 - {0.9238795325112867, 0.3826834323650898}, // k=4 - {0.8819212643483550, 0.4713967368259976}, // k=5 - {0.8314696123025452, 0.5555702330196022}, // k=6 - {0.7730104533627370, 0.6343932841636455}, // k=7 - {0.7071067811865476, 0.7071067811865475}, // k=8 - {0.6343932841636455, 0.7730104533627370}, // k=9 - {0.5555702330196023, 0.8314696123025452}, // k=10 - {0.4713967368259978, 0.8819212643483549}, // k=11 - {0.3826834323650898, 0.9238795325112867}, // k=12 - {0.2902846772544623, 0.9569403357322089}, // k=13 - {0.1950903220161283, 0.9807852804032304}, // k=14 - {0.0980171403295608, 0.9951847266721968}, // k=15 - {0.0000000000000001, 1.0000000000000000}, // k=16 - {-0.0980171403295606, 0.9951847266721969}, // k=17 - {-0.1950903220161282, 0.9807852804032304}, // k=18 - {-0.2902846772544622, 0.9569403357322089}, // k=19 - {-0.3826834323650897, 0.9238795325112867}, // k=20 - {-0.4713967368259977, 0.8819212643483550}, // k=21 - {-0.5555702330196020, 0.8314696123025455}, // k=22 - {-0.6343932841636454, 0.7730104533627371}, // k=23 - {-0.7071067811865475, 0.7071067811865476}, // k=24 - {-0.7730104533627370, 0.6343932841636455}, // k=25 - {-0.8314696123025453, 0.5555702330196022}, // k=26 - {-0.8819212643483549, 0.4713967368259979}, // k=27 - {-0.9238795325112867, 0.3826834323650899}, // k=28 - {-0.9569403357322088, 0.2902846772544624}, // k=29 - {-0.9807852804032304, 0.1950903220161286}, // k=30 - {-0.9951847266721968, 0.0980171403295608}, // k=31 - {-1.0000000000000000, 0.0000000000000001}, // k=32 - {-0.9951847266721969, -0.0980171403295606}, // k=33 - {-0.9807852804032304, -0.1950903220161284}, // k=34 - {-0.9569403357322089, -0.2902846772544621}, // k=35 - {-0.9238795325112868, -0.3826834323650897}, // k=36 - {-0.8819212643483550, -0.4713967368259976}, // k=37 - {-0.8314696123025455, -0.5555702330196020}, // k=38 - {-0.7730104533627371, -0.6343932841636453}, // k=39 - 
{-0.7071067811865477, -0.7071067811865475}, // k=40 - {-0.6343932841636459, -0.7730104533627367}, // k=41 - {-0.5555702330196022, -0.8314696123025452}, // k=42 - {-0.4713967368259979, -0.8819212643483549}, // k=43 - {-0.3826834323650903, -0.9238795325112865}, // k=44 - {-0.2902846772544624, -0.9569403357322088}, // k=45 - {-0.1950903220161287, -0.9807852804032303}, // k=46 - {-0.0980171403295605, -0.9951847266721969}, // k=47 - {-0.0000000000000002, -1.0000000000000000}, // k=48 - {0.0980171403295601, -0.9951847266721969}, // k=49 - {0.1950903220161283, -0.9807852804032304}, // k=50 - {0.2902846772544621, -0.9569403357322089}, // k=51 - {0.3826834323650900, -0.9238795325112866}, // k=52 - {0.4713967368259976, -0.8819212643483550}, // k=53 - {0.5555702330196018, -0.8314696123025455}, // k=54 - {0.6343932841636456, -0.7730104533627369}, // k=55 - {0.7071067811865474, -0.7071067811865477}, // k=56 - {0.7730104533627367, -0.6343932841636459}, // k=57 - {0.8314696123025452, -0.5555702330196022}, // k=58 - {0.8819212643483548, -0.4713967368259979}, // k=59 - {0.9238795325112865, -0.3826834323650904}, // k=60 - {0.9569403357322088, -0.2902846772544625}, // k=61 - {0.9807852804032303, -0.1950903220161287}, // k=62 - {0.9951847266721969, -0.0980171403295605}, // k=63 -}; -__constant__ double2 c_twiddle_fwd_128[128] = { +__constant__ double2 c_twiddle_128[128] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9987954562051724, -0.0490676743274180}, // k=1 {0.9951847266721969, -0.0980171403295606}, // k=2 @@ -368,137 +250,7 @@ __constant__ double2 c_twiddle_fwd_128[128] = { {0.9987954562051724, 0.0490676743274181}, // k=127 }; -__constant__ double2 c_twiddle_inv_128[128] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9987954562051724, 0.0490676743274180}, // k=1 - {0.9951847266721969, 0.0980171403295606}, // k=2 - {0.9891765099647810, 0.1467304744553617}, // k=3 - {0.9807852804032304, 0.1950903220161282}, // k=4 - {0.9700312531945440, 0.2429801799032639}, 
// k=5 - {0.9569403357322088, 0.2902846772544623}, // k=6 - {0.9415440651830208, 0.3368898533922201}, // k=7 - {0.9238795325112867, 0.3826834323650898}, // k=8 - {0.9039892931234433, 0.4275550934302821}, // k=9 - {0.8819212643483550, 0.4713967368259976}, // k=10 - {0.8577286100002721, 0.5141027441932217}, // k=11 - {0.8314696123025452, 0.5555702330196022}, // k=12 - {0.8032075314806449, 0.5956993044924334}, // k=13 - {0.7730104533627370, 0.6343932841636455}, // k=14 - {0.7409511253549591, 0.6715589548470183}, // k=15 - {0.7071067811865476, 0.7071067811865475}, // k=16 - {0.6715589548470183, 0.7409511253549591}, // k=17 - {0.6343932841636455, 0.7730104533627370}, // k=18 - {0.5956993044924335, 0.8032075314806448}, // k=19 - {0.5555702330196023, 0.8314696123025452}, // k=20 - {0.5141027441932217, 0.8577286100002721}, // k=21 - {0.4713967368259978, 0.8819212643483549}, // k=22 - {0.4275550934302822, 0.9039892931234433}, // k=23 - {0.3826834323650898, 0.9238795325112867}, // k=24 - {0.3368898533922201, 0.9415440651830208}, // k=25 - {0.2902846772544623, 0.9569403357322089}, // k=26 - {0.2429801799032640, 0.9700312531945440}, // k=27 - {0.1950903220161283, 0.9807852804032304}, // k=28 - {0.1467304744553617, 0.9891765099647810}, // k=29 - {0.0980171403295608, 0.9951847266721968}, // k=30 - {0.0490676743274181, 0.9987954562051724}, // k=31 - {0.0000000000000001, 1.0000000000000000}, // k=32 - {-0.0490676743274180, 0.9987954562051724}, // k=33 - {-0.0980171403295606, 0.9951847266721969}, // k=34 - {-0.1467304744553616, 0.9891765099647810}, // k=35 - {-0.1950903220161282, 0.9807852804032304}, // k=36 - {-0.2429801799032639, 0.9700312531945440}, // k=37 - {-0.2902846772544622, 0.9569403357322089}, // k=38 - {-0.3368898533922199, 0.9415440651830208}, // k=39 - {-0.3826834323650897, 0.9238795325112867}, // k=40 - {-0.4275550934302819, 0.9039892931234434}, // k=41 - {-0.4713967368259977, 0.8819212643483550}, // k=42 - {-0.5141027441932217, 0.8577286100002721}, // k=43 - 
{-0.5555702330196020, 0.8314696123025455}, // k=44 - {-0.5956993044924334, 0.8032075314806449}, // k=45 - {-0.6343932841636454, 0.7730104533627371}, // k=46 - {-0.6715589548470184, 0.7409511253549590}, // k=47 - {-0.7071067811865475, 0.7071067811865476}, // k=48 - {-0.7409511253549589, 0.6715589548470186}, // k=49 - {-0.7730104533627370, 0.6343932841636455}, // k=50 - {-0.8032075314806448, 0.5956993044924335}, // k=51 - {-0.8314696123025453, 0.5555702330196022}, // k=52 - {-0.8577286100002720, 0.5141027441932218}, // k=53 - {-0.8819212643483549, 0.4713967368259979}, // k=54 - {-0.9039892931234433, 0.4275550934302820}, // k=55 - {-0.9238795325112867, 0.3826834323650899}, // k=56 - {-0.9415440651830207, 0.3368898533922203}, // k=57 - {-0.9569403357322088, 0.2902846772544624}, // k=58 - {-0.9700312531945440, 0.2429801799032641}, // k=59 - {-0.9807852804032304, 0.1950903220161286}, // k=60 - {-0.9891765099647810, 0.1467304744553618}, // k=61 - {-0.9951847266721968, 0.0980171403295608}, // k=62 - {-0.9987954562051724, 0.0490676743274180}, // k=63 - {-1.0000000000000000, 0.0000000000000001}, // k=64 - {-0.9987954562051724, -0.0490676743274177}, // k=65 - {-0.9951847266721969, -0.0980171403295606}, // k=66 - {-0.9891765099647810, -0.1467304744553616}, // k=67 - {-0.9807852804032304, -0.1950903220161284}, // k=68 - {-0.9700312531945440, -0.2429801799032638}, // k=69 - {-0.9569403357322089, -0.2902846772544621}, // k=70 - {-0.9415440651830208, -0.3368898533922201}, // k=71 - {-0.9238795325112868, -0.3826834323650897}, // k=72 - {-0.9039892931234434, -0.4275550934302818}, // k=73 - {-0.8819212643483550, -0.4713967368259976}, // k=74 - {-0.8577286100002721, -0.5141027441932216}, // k=75 - {-0.8314696123025455, -0.5555702330196020}, // k=76 - {-0.8032075314806449, -0.5956993044924332}, // k=77 - {-0.7730104533627371, -0.6343932841636453}, // k=78 - {-0.7409511253549591, -0.6715589548470184}, // k=79 - {-0.7071067811865477, -0.7071067811865475}, // k=80 - {-0.6715589548470187, 
-0.7409511253549589}, // k=81 - {-0.6343932841636459, -0.7730104533627367}, // k=82 - {-0.5956993044924331, -0.8032075314806451}, // k=83 - {-0.5555702330196022, -0.8314696123025452}, // k=84 - {-0.5141027441932218, -0.8577286100002720}, // k=85 - {-0.4713967368259979, -0.8819212643483549}, // k=86 - {-0.4275550934302825, -0.9039892931234431}, // k=87 - {-0.3826834323650903, -0.9238795325112865}, // k=88 - {-0.3368898533922199, -0.9415440651830208}, // k=89 - {-0.2902846772544624, -0.9569403357322088}, // k=90 - {-0.2429801799032641, -0.9700312531945440}, // k=91 - {-0.1950903220161287, -0.9807852804032303}, // k=92 - {-0.1467304744553623, -0.9891765099647809}, // k=93 - {-0.0980171403295605, -0.9951847266721969}, // k=94 - {-0.0490676743274180, -0.9987954562051724}, // k=95 - {-0.0000000000000002, -1.0000000000000000}, // k=96 - {0.0490676743274177, -0.9987954562051724}, // k=97 - {0.0980171403295601, -0.9951847266721969}, // k=98 - {0.1467304744553619, -0.9891765099647809}, // k=99 - {0.1950903220161283, -0.9807852804032304}, // k=100 - {0.2429801799032638, -0.9700312531945440}, // k=101 - {0.2902846772544621, -0.9569403357322089}, // k=102 - {0.3368898533922196, -0.9415440651830209}, // k=103 - {0.3826834323650900, -0.9238795325112866}, // k=104 - {0.4275550934302821, -0.9039892931234433}, // k=105 - {0.4713967368259976, -0.8819212643483550}, // k=106 - {0.5141027441932216, -0.8577286100002722}, // k=107 - {0.5555702330196018, -0.8314696123025455}, // k=108 - {0.5956993044924329, -0.8032075314806453}, // k=109 - {0.6343932841636456, -0.7730104533627369}, // k=110 - {0.6715589548470183, -0.7409511253549591}, // k=111 - {0.7071067811865474, -0.7071067811865477}, // k=112 - {0.7409511253549589, -0.6715589548470187}, // k=113 - {0.7730104533627367, -0.6343932841636459}, // k=114 - {0.8032075314806451, -0.5956993044924332}, // k=115 - {0.8314696123025452, -0.5555702330196022}, // k=116 - {0.8577286100002720, -0.5141027441932219}, // k=117 - {0.8819212643483548, 
-0.4713967368259979}, // k=118 - {0.9039892931234431, -0.4275550934302825}, // k=119 - {0.9238795325112865, -0.3826834323650904}, // k=120 - {0.9415440651830208, -0.3368898533922200}, // k=121 - {0.9569403357322088, -0.2902846772544625}, // k=122 - {0.9700312531945440, -0.2429801799032642}, // k=123 - {0.9807852804032303, -0.1950903220161287}, // k=124 - {0.9891765099647809, -0.1467304744553624}, // k=125 - {0.9951847266721969, -0.0980171403295605}, // k=126 - {0.9987954562051724, -0.0490676743274181}, // k=127 -}; -__constant__ double2 c_twiddle_fwd_256[256] = { +__constant__ double2 c_twiddle_256[256] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9996988186962042, -0.0245412285229123}, // k=1 {0.9987954562051724, -0.0490676743274180}, // k=2 @@ -757,265 +509,7 @@ __constant__ double2 c_twiddle_fwd_256[256] = { {0.9996988186962042, 0.0245412285229124}, // k=255 }; -__constant__ double2 c_twiddle_inv_256[256] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9996988186962042, 0.0245412285229123}, // k=1 - {0.9987954562051724, 0.0490676743274180}, // k=2 - {0.9972904566786902, 0.0735645635996674}, // k=3 - {0.9951847266721969, 0.0980171403295606}, // k=4 - {0.9924795345987100, 0.1224106751992162}, // k=5 - {0.9891765099647810, 0.1467304744553617}, // k=6 - {0.9852776423889412, 0.1709618887603012}, // k=7 - {0.9807852804032304, 0.1950903220161282}, // k=8 - {0.9757021300385286, 0.2191012401568698}, // k=9 - {0.9700312531945440, 0.2429801799032639}, // k=10 - {0.9637760657954398, 0.2667127574748984}, // k=11 - {0.9569403357322088, 0.2902846772544623}, // k=12 - {0.9495281805930367, 0.3136817403988915}, // k=13 - {0.9415440651830208, 0.3368898533922201}, // k=14 - {0.9329927988347390, 0.3598950365349881}, // k=15 - {0.9238795325112867, 0.3826834323650898}, // k=16 - {0.9142097557035307, 0.4052413140049899}, // k=17 - {0.9039892931234433, 0.4275550934302821}, // k=18 - {0.8932243011955153, 0.4496113296546065}, // k=19 - {0.8819212643483550, 
0.4713967368259976}, // k=20 - {0.8700869911087115, 0.4928981922297840}, // k=21 - {0.8577286100002721, 0.5141027441932217}, // k=22 - {0.8448535652497071, 0.5349976198870972}, // k=23 - {0.8314696123025452, 0.5555702330196022}, // k=24 - {0.8175848131515837, 0.5758081914178453}, // k=25 - {0.8032075314806449, 0.5956993044924334}, // k=26 - {0.7883464276266063, 0.6152315905806268}, // k=27 - {0.7730104533627370, 0.6343932841636455}, // k=28 - {0.7572088465064846, 0.6531728429537768}, // k=29 - {0.7409511253549591, 0.6715589548470183}, // k=30 - {0.7242470829514670, 0.6895405447370668}, // k=31 - {0.7071067811865476, 0.7071067811865475}, // k=32 - {0.6895405447370669, 0.7242470829514669}, // k=33 - {0.6715589548470183, 0.7409511253549591}, // k=34 - {0.6531728429537768, 0.7572088465064845}, // k=35 - {0.6343932841636455, 0.7730104533627370}, // k=36 - {0.6152315905806268, 0.7883464276266062}, // k=37 - {0.5956993044924335, 0.8032075314806448}, // k=38 - {0.5758081914178453, 0.8175848131515837}, // k=39 - {0.5555702330196023, 0.8314696123025452}, // k=40 - {0.5349976198870973, 0.8448535652497070}, // k=41 - {0.5141027441932217, 0.8577286100002721}, // k=42 - {0.4928981922297841, 0.8700869911087113}, // k=43 - {0.4713967368259978, 0.8819212643483549}, // k=44 - {0.4496113296546066, 0.8932243011955153}, // k=45 - {0.4275550934302822, 0.9039892931234433}, // k=46 - {0.4052413140049899, 0.9142097557035307}, // k=47 - {0.3826834323650898, 0.9238795325112867}, // k=48 - {0.3598950365349883, 0.9329927988347388}, // k=49 - {0.3368898533922201, 0.9415440651830208}, // k=50 - {0.3136817403988916, 0.9495281805930367}, // k=51 - {0.2902846772544623, 0.9569403357322089}, // k=52 - {0.2667127574748984, 0.9637760657954398}, // k=53 - {0.2429801799032640, 0.9700312531945440}, // k=54 - {0.2191012401568698, 0.9757021300385286}, // k=55 - {0.1950903220161283, 0.9807852804032304}, // k=56 - {0.1709618887603014, 0.9852776423889412}, // k=57 - {0.1467304744553617, 0.9891765099647810}, // 
k=58 - {0.1224106751992163, 0.9924795345987100}, // k=59 - {0.0980171403295608, 0.9951847266721968}, // k=60 - {0.0735645635996675, 0.9972904566786902}, // k=61 - {0.0490676743274181, 0.9987954562051724}, // k=62 - {0.0245412285229123, 0.9996988186962042}, // k=63 - {0.0000000000000001, 1.0000000000000000}, // k=64 - {-0.0245412285229121, 0.9996988186962042}, // k=65 - {-0.0490676743274180, 0.9987954562051724}, // k=66 - {-0.0735645635996673, 0.9972904566786902}, // k=67 - {-0.0980171403295606, 0.9951847266721969}, // k=68 - {-0.1224106751992162, 0.9924795345987100}, // k=69 - {-0.1467304744553616, 0.9891765099647810}, // k=70 - {-0.1709618887603012, 0.9852776423889412}, // k=71 - {-0.1950903220161282, 0.9807852804032304}, // k=72 - {-0.2191012401568697, 0.9757021300385286}, // k=73 - {-0.2429801799032639, 0.9700312531945440}, // k=74 - {-0.2667127574748983, 0.9637760657954398}, // k=75 - {-0.2902846772544622, 0.9569403357322089}, // k=76 - {-0.3136817403988914, 0.9495281805930367}, // k=77 - {-0.3368898533922199, 0.9415440651830208}, // k=78 - {-0.3598950365349882, 0.9329927988347388}, // k=79 - {-0.3826834323650897, 0.9238795325112867}, // k=80 - {-0.4052413140049897, 0.9142097557035307}, // k=81 - {-0.4275550934302819, 0.9039892931234434}, // k=82 - {-0.4496113296546067, 0.8932243011955152}, // k=83 - {-0.4713967368259977, 0.8819212643483550}, // k=84 - {-0.4928981922297840, 0.8700869911087115}, // k=85 - {-0.5141027441932217, 0.8577286100002721}, // k=86 - {-0.5349976198870970, 0.8448535652497072}, // k=87 - {-0.5555702330196020, 0.8314696123025455}, // k=88 - {-0.5758081914178453, 0.8175848131515837}, // k=89 - {-0.5956993044924334, 0.8032075314806449}, // k=90 - {-0.6152315905806267, 0.7883464276266063}, // k=91 - {-0.6343932841636454, 0.7730104533627371}, // k=92 - {-0.6531728429537765, 0.7572088465064847}, // k=93 - {-0.6715589548470184, 0.7409511253549590}, // k=94 - {-0.6895405447370669, 0.7242470829514669}, // k=95 - {-0.7071067811865475, 
0.7071067811865476}, // k=96 - {-0.7242470829514668, 0.6895405447370671}, // k=97 - {-0.7409511253549589, 0.6715589548470186}, // k=98 - {-0.7572088465064846, 0.6531728429537766}, // k=99 - {-0.7730104533627370, 0.6343932841636455}, // k=100 - {-0.7883464276266062, 0.6152315905806269}, // k=101 - {-0.8032075314806448, 0.5956993044924335}, // k=102 - {-0.8175848131515836, 0.5758081914178454}, // k=103 - {-0.8314696123025453, 0.5555702330196022}, // k=104 - {-0.8448535652497071, 0.5349976198870972}, // k=105 - {-0.8577286100002720, 0.5141027441932218}, // k=106 - {-0.8700869911087113, 0.4928981922297841}, // k=107 - {-0.8819212643483549, 0.4713967368259979}, // k=108 - {-0.8932243011955152, 0.4496113296546069}, // k=109 - {-0.9039892931234433, 0.4275550934302820}, // k=110 - {-0.9142097557035307, 0.4052413140049899}, // k=111 - {-0.9238795325112867, 0.3826834323650899}, // k=112 - {-0.9329927988347388, 0.3598950365349883}, // k=113 - {-0.9415440651830207, 0.3368898533922203}, // k=114 - {-0.9495281805930367, 0.3136817403988914}, // k=115 - {-0.9569403357322088, 0.2902846772544624}, // k=116 - {-0.9637760657954398, 0.2667127574748985}, // k=117 - {-0.9700312531945440, 0.2429801799032641}, // k=118 - {-0.9757021300385285, 0.2191012401568700}, // k=119 - {-0.9807852804032304, 0.1950903220161286}, // k=120 - {-0.9852776423889412, 0.1709618887603012}, // k=121 - {-0.9891765099647810, 0.1467304744553618}, // k=122 - {-0.9924795345987100, 0.1224106751992163}, // k=123 - {-0.9951847266721968, 0.0980171403295608}, // k=124 - {-0.9972904566786902, 0.0735645635996677}, // k=125 - {-0.9987954562051724, 0.0490676743274180}, // k=126 - {-0.9996988186962042, 0.0245412285229123}, // k=127 - {-1.0000000000000000, 0.0000000000000001}, // k=128 - {-0.9996988186962042, -0.0245412285229121}, // k=129 - {-0.9987954562051724, -0.0490676743274177}, // k=130 - {-0.9972904566786902, -0.0735645635996675}, // k=131 - {-0.9951847266721969, -0.0980171403295606}, // k=132 - {-0.9924795345987100, 
-0.1224106751992161}, // k=133 - {-0.9891765099647810, -0.1467304744553616}, // k=134 - {-0.9852776423889413, -0.1709618887603010}, // k=135 - {-0.9807852804032304, -0.1950903220161284}, // k=136 - {-0.9757021300385286, -0.2191012401568698}, // k=137 - {-0.9700312531945440, -0.2429801799032638}, // k=138 - {-0.9637760657954400, -0.2667127574748983}, // k=139 - {-0.9569403357322089, -0.2902846772544621}, // k=140 - {-0.9495281805930368, -0.3136817403988912}, // k=141 - {-0.9415440651830208, -0.3368898533922201}, // k=142 - {-0.9329927988347390, -0.3598950365349881}, // k=143 - {-0.9238795325112868, -0.3826834323650897}, // k=144 - {-0.9142097557035307, -0.4052413140049897}, // k=145 - {-0.9039892931234434, -0.4275550934302818}, // k=146 - {-0.8932243011955153, -0.4496113296546067}, // k=147 - {-0.8819212643483550, -0.4713967368259976}, // k=148 - {-0.8700869911087115, -0.4928981922297839}, // k=149 - {-0.8577286100002721, -0.5141027441932216}, // k=150 - {-0.8448535652497072, -0.5349976198870969}, // k=151 - {-0.8314696123025455, -0.5555702330196020}, // k=152 - {-0.8175848131515837, -0.5758081914178453}, // k=153 - {-0.8032075314806449, -0.5956993044924332}, // k=154 - {-0.7883464276266063, -0.6152315905806267}, // k=155 - {-0.7730104533627371, -0.6343932841636453}, // k=156 - {-0.7572088465064848, -0.6531728429537765}, // k=157 - {-0.7409511253549591, -0.6715589548470184}, // k=158 - {-0.7242470829514670, -0.6895405447370668}, // k=159 - {-0.7071067811865477, -0.7071067811865475}, // k=160 - {-0.6895405447370671, -0.7242470829514668}, // k=161 - {-0.6715589548470187, -0.7409511253549589}, // k=162 - {-0.6531728429537771, -0.7572088465064842}, // k=163 - {-0.6343932841636459, -0.7730104533627367}, // k=164 - {-0.6152315905806273, -0.7883464276266059}, // k=165 - {-0.5956993044924331, -0.8032075314806451}, // k=166 - {-0.5758081914178452, -0.8175848131515838}, // k=167 - {-0.5555702330196022, -0.8314696123025452}, // k=168 - {-0.5349976198870973, 
-0.8448535652497070}, // k=169 - {-0.5141027441932218, -0.8577286100002720}, // k=170 - {-0.4928981922297842, -0.8700869911087113}, // k=171 - {-0.4713967368259979, -0.8819212643483549}, // k=172 - {-0.4496113296546069, -0.8932243011955152}, // k=173 - {-0.4275550934302825, -0.9039892931234431}, // k=174 - {-0.4052413140049904, -0.9142097557035305}, // k=175 - {-0.3826834323650903, -0.9238795325112865}, // k=176 - {-0.3598950365349879, -0.9329927988347390}, // k=177 - {-0.3368898533922199, -0.9415440651830208}, // k=178 - {-0.3136817403988915, -0.9495281805930367}, // k=179 - {-0.2902846772544624, -0.9569403357322088}, // k=180 - {-0.2667127574748985, -0.9637760657954398}, // k=181 - {-0.2429801799032641, -0.9700312531945440}, // k=182 - {-0.2191012401568701, -0.9757021300385285}, // k=183 - {-0.1950903220161287, -0.9807852804032303}, // k=184 - {-0.1709618887603017, -0.9852776423889411}, // k=185 - {-0.1467304744553623, -0.9891765099647809}, // k=186 - {-0.1224106751992160, -0.9924795345987101}, // k=187 - {-0.0980171403295605, -0.9951847266721969}, // k=188 - {-0.0735645635996674, -0.9972904566786902}, // k=189 - {-0.0490676743274180, -0.9987954562051724}, // k=190 - {-0.0245412285229124, -0.9996988186962042}, // k=191 - {-0.0000000000000002, -1.0000000000000000}, // k=192 - {0.0245412285229120, -0.9996988186962042}, // k=193 - {0.0490676743274177, -0.9987954562051724}, // k=194 - {0.0735645635996670, -0.9972904566786902}, // k=195 - {0.0980171403295601, -0.9951847266721969}, // k=196 - {0.1224106751992156, -0.9924795345987101}, // k=197 - {0.1467304744553619, -0.9891765099647809}, // k=198 - {0.1709618887603013, -0.9852776423889412}, // k=199 - {0.1950903220161283, -0.9807852804032304}, // k=200 - {0.2191012401568697, -0.9757021300385286}, // k=201 - {0.2429801799032638, -0.9700312531945440}, // k=202 - {0.2667127574748982, -0.9637760657954400}, // k=203 - {0.2902846772544621, -0.9569403357322089}, // k=204 - {0.3136817403988911, -0.9495281805930368}, // k=205 - 
{0.3368898533922196, -0.9415440651830209}, // k=206 - {0.3598950365349876, -0.9329927988347391}, // k=207 - {0.3826834323650900, -0.9238795325112866}, // k=208 - {0.4052413140049900, -0.9142097557035306}, // k=209 - {0.4275550934302821, -0.9039892931234433}, // k=210 - {0.4496113296546066, -0.8932243011955153}, // k=211 - {0.4713967368259976, -0.8819212643483550}, // k=212 - {0.4928981922297839, -0.8700869911087115}, // k=213 - {0.5141027441932216, -0.8577286100002722}, // k=214 - {0.5349976198870969, -0.8448535652497072}, // k=215 - {0.5555702330196018, -0.8314696123025455}, // k=216 - {0.5758081914178449, -0.8175848131515840}, // k=217 - {0.5956993044924329, -0.8032075314806453}, // k=218 - {0.6152315905806270, -0.7883464276266061}, // k=219 - {0.6343932841636456, -0.7730104533627369}, // k=220 - {0.6531728429537768, -0.7572088465064846}, // k=221 - {0.6715589548470183, -0.7409511253549591}, // k=222 - {0.6895405447370668, -0.7242470829514670}, // k=223 - {0.7071067811865474, -0.7071067811865477}, // k=224 - {0.7242470829514667, -0.6895405447370672}, // k=225 - {0.7409511253549589, -0.6715589548470187}, // k=226 - {0.7572088465064842, -0.6531728429537771}, // k=227 - {0.7730104533627367, -0.6343932841636459}, // k=228 - {0.7883464276266059, -0.6152315905806274}, // k=229 - {0.8032075314806451, -0.5956993044924332}, // k=230 - {0.8175848131515837, -0.5758081914178452}, // k=231 - {0.8314696123025452, -0.5555702330196022}, // k=232 - {0.8448535652497070, -0.5349976198870973}, // k=233 - {0.8577286100002720, -0.5141027441932219}, // k=234 - {0.8700869911087113, -0.4928981922297843}, // k=235 - {0.8819212643483548, -0.4713967368259979}, // k=236 - {0.8932243011955151, -0.4496113296546070}, // k=237 - {0.9039892931234431, -0.4275550934302825}, // k=238 - {0.9142097557035305, -0.4052413140049904}, // k=239 - {0.9238795325112865, -0.3826834323650904}, // k=240 - {0.9329927988347390, -0.3598950365349880}, // k=241 - {0.9415440651830208, -0.3368898533922200}, // k=242 - 
{0.9495281805930367, -0.3136817403988915}, // k=243 - {0.9569403357322088, -0.2902846772544625}, // k=244 - {0.9637760657954398, -0.2667127574748986}, // k=245 - {0.9700312531945440, -0.2429801799032642}, // k=246 - {0.9757021300385285, -0.2191012401568702}, // k=247 - {0.9807852804032303, -0.1950903220161287}, // k=248 - {0.9852776423889411, -0.1709618887603018}, // k=249 - {0.9891765099647809, -0.1467304744553624}, // k=250 - {0.9924795345987100, -0.1224106751992160}, // k=251 - {0.9951847266721969, -0.0980171403295605}, // k=252 - {0.9972904566786902, -0.0735645635996674}, // k=253 - {0.9987954562051724, -0.0490676743274181}, // k=254 - {0.9996988186962042, -0.0245412285229124}, // k=255 -}; -__constant__ double2 c_twiddle_fwd_512[512] = { +__constant__ double2 c_twiddle_512[512] = { {1.0000000000000000, -0.0000000000000000}, // k=0 {0.9999247018391445, -0.0122715382857199}, // k=1 {0.9996988186962042, -0.0245412285229123}, // k=2 @@ -1529,518 +1023,3 @@ __constant__ double2 c_twiddle_fwd_512[512] = { {0.9996988186962042, 0.0245412285229124}, // k=510 {0.9999247018391445, 0.0122715382857206}, // k=511 }; - -__constant__ double2 c_twiddle_inv_512[512] = { - {1.0000000000000000, 0.0000000000000000}, // k=0 - {0.9999247018391445, 0.0122715382857199}, // k=1 - {0.9996988186962042, 0.0245412285229123}, // k=2 - {0.9993223845883495, 0.0368072229413588}, // k=3 - {0.9987954562051724, 0.0490676743274180}, // k=4 - {0.9981181129001492, 0.0613207363022086}, // k=5 - {0.9972904566786902, 0.0735645635996674}, // k=6 - {0.9963126121827780, 0.0857973123444399}, // k=7 - {0.9951847266721969, 0.0980171403295606}, // k=8 - {0.9939069700023561, 0.1102222072938831}, // k=9 - {0.9924795345987100, 0.1224106751992162}, // k=10 - {0.9909026354277800, 0.1345807085071262}, // k=11 - {0.9891765099647810, 0.1467304744553617}, // k=12 - {0.9873014181578584, 0.1588581433338614}, // k=13 - {0.9852776423889412, 0.1709618887603012}, // k=14 - {0.9831054874312163, 0.1830398879551410}, // k=15 - 
{0.9807852804032304, 0.1950903220161282}, // k=16 - {0.9783173707196277, 0.2071113761922186}, // k=17 - {0.9757021300385286, 0.2191012401568698}, // k=18 - {0.9729399522055602, 0.2310581082806711}, // k=19 - {0.9700312531945440, 0.2429801799032639}, // k=20 - {0.9669764710448521, 0.2548656596045146}, // k=21 - {0.9637760657954398, 0.2667127574748984}, // k=22 - {0.9604305194155658, 0.2785196893850531}, // k=23 - {0.9569403357322088, 0.2902846772544623}, // k=24 - {0.9533060403541939, 0.3020059493192281}, // k=25 - {0.9495281805930367, 0.3136817403988915}, // k=26 - {0.9456073253805213, 0.3253102921622629}, // k=27 - {0.9415440651830208, 0.3368898533922201}, // k=28 - {0.9373390119125750, 0.3484186802494346}, // k=29 - {0.9329927988347390, 0.3598950365349881}, // k=30 - {0.9285060804732156, 0.3713171939518375}, // k=31 - {0.9238795325112867, 0.3826834323650898}, // k=32 - {0.9191138516900578, 0.3939920400610481}, // k=33 - {0.9142097557035307, 0.4052413140049899}, // k=34 - {0.9091679830905224, 0.4164295600976372}, // k=35 - {0.9039892931234433, 0.4275550934302821}, // k=36 - {0.8986744656939538, 0.4386162385385277}, // k=37 - {0.8932243011955153, 0.4496113296546065}, // k=38 - {0.8876396204028539, 0.4605387109582400}, // k=39 - {0.8819212643483550, 0.4713967368259976}, // k=40 - {0.8760700941954066, 0.4821837720791227}, // k=41 - {0.8700869911087115, 0.4928981922297840}, // k=42 - {0.8639728561215868, 0.5035383837257176}, // k=43 - {0.8577286100002721, 0.5141027441932217}, // k=44 - {0.8513551931052652, 0.5245896826784689}, // k=45 - {0.8448535652497071, 0.5349976198870972}, // k=46 - {0.8382247055548381, 0.5453249884220465}, // k=47 - {0.8314696123025452, 0.5555702330196022}, // k=48 - {0.8245893027850253, 0.5657318107836131}, // k=49 - {0.8175848131515837, 0.5758081914178453}, // k=50 - {0.8104571982525948, 0.5857978574564389}, // k=51 - {0.8032075314806449, 0.5956993044924334}, // k=52 - {0.7958369046088836, 0.6055110414043255}, // k=53 - {0.7883464276266063, 
0.6152315905806268}, // k=54 - {0.7807372285720945, 0.6248594881423863}, // k=55 - {0.7730104533627370, 0.6343932841636455}, // k=56 - {0.7651672656224590, 0.6438315428897914}, // k=57 - {0.7572088465064846, 0.6531728429537768}, // k=58 - {0.7491363945234594, 0.6624157775901718}, // k=59 - {0.7409511253549591, 0.6715589548470183}, // k=60 - {0.7326542716724128, 0.6806009977954530}, // k=61 - {0.7242470829514670, 0.6895405447370668}, // k=62 - {0.7157308252838186, 0.6983762494089729}, // k=63 - {0.7071067811865476, 0.7071067811865475}, // k=64 - {0.6983762494089729, 0.7157308252838186}, // k=65 - {0.6895405447370669, 0.7242470829514669}, // k=66 - {0.6806009977954531, 0.7326542716724128}, // k=67 - {0.6715589548470183, 0.7409511253549591}, // k=68 - {0.6624157775901718, 0.7491363945234593}, // k=69 - {0.6531728429537768, 0.7572088465064845}, // k=70 - {0.6438315428897915, 0.7651672656224590}, // k=71 - {0.6343932841636455, 0.7730104533627370}, // k=72 - {0.6248594881423865, 0.7807372285720944}, // k=73 - {0.6152315905806268, 0.7883464276266062}, // k=74 - {0.6055110414043255, 0.7958369046088835}, // k=75 - {0.5956993044924335, 0.8032075314806448}, // k=76 - {0.5857978574564389, 0.8104571982525948}, // k=77 - {0.5758081914178453, 0.8175848131515837}, // k=78 - {0.5657318107836132, 0.8245893027850253}, // k=79 - {0.5555702330196023, 0.8314696123025452}, // k=80 - {0.5453249884220465, 0.8382247055548380}, // k=81 - {0.5349976198870973, 0.8448535652497070}, // k=82 - {0.5245896826784688, 0.8513551931052652}, // k=83 - {0.5141027441932217, 0.8577286100002721}, // k=84 - {0.5035383837257176, 0.8639728561215867}, // k=85 - {0.4928981922297841, 0.8700869911087113}, // k=86 - {0.4821837720791228, 0.8760700941954066}, // k=87 - {0.4713967368259978, 0.8819212643483549}, // k=88 - {0.4605387109582400, 0.8876396204028539}, // k=89 - {0.4496113296546066, 0.8932243011955153}, // k=90 - {0.4386162385385277, 0.8986744656939538}, // k=91 - {0.4275550934302822, 0.9039892931234433}, // 
k=92 - {0.4164295600976373, 0.9091679830905223}, // k=93 - {0.4052413140049899, 0.9142097557035307}, // k=94 - {0.3939920400610481, 0.9191138516900578}, // k=95 - {0.3826834323650898, 0.9238795325112867}, // k=96 - {0.3713171939518376, 0.9285060804732155}, // k=97 - {0.3598950365349883, 0.9329927988347388}, // k=98 - {0.3484186802494345, 0.9373390119125750}, // k=99 - {0.3368898533922201, 0.9415440651830208}, // k=100 - {0.3253102921622630, 0.9456073253805213}, // k=101 - {0.3136817403988916, 0.9495281805930367}, // k=102 - {0.3020059493192282, 0.9533060403541938}, // k=103 - {0.2902846772544623, 0.9569403357322089}, // k=104 - {0.2785196893850531, 0.9604305194155658}, // k=105 - {0.2667127574748984, 0.9637760657954398}, // k=106 - {0.2548656596045146, 0.9669764710448521}, // k=107 - {0.2429801799032640, 0.9700312531945440}, // k=108 - {0.2310581082806713, 0.9729399522055601}, // k=109 - {0.2191012401568698, 0.9757021300385286}, // k=110 - {0.2071113761922186, 0.9783173707196277}, // k=111 - {0.1950903220161283, 0.9807852804032304}, // k=112 - {0.1830398879551411, 0.9831054874312163}, // k=113 - {0.1709618887603014, 0.9852776423889412}, // k=114 - {0.1588581433338614, 0.9873014181578584}, // k=115 - {0.1467304744553617, 0.9891765099647810}, // k=116 - {0.1345807085071262, 0.9909026354277800}, // k=117 - {0.1224106751992163, 0.9924795345987100}, // k=118 - {0.1102222072938832, 0.9939069700023561}, // k=119 - {0.0980171403295608, 0.9951847266721968}, // k=120 - {0.0857973123444399, 0.9963126121827780}, // k=121 - {0.0735645635996675, 0.9972904566786902}, // k=122 - {0.0613207363022086, 0.9981181129001492}, // k=123 - {0.0490676743274181, 0.9987954562051724}, // k=124 - {0.0368072229413590, 0.9993223845883495}, // k=125 - {0.0245412285229123, 0.9996988186962042}, // k=126 - {0.0122715382857199, 0.9999247018391445}, // k=127 - {0.0000000000000001, 1.0000000000000000}, // k=128 - {-0.0122715382857198, 0.9999247018391445}, // k=129 - {-0.0245412285229121, 
0.9996988186962042}, // k=130 - {-0.0368072229413589, 0.9993223845883495}, // k=131 - {-0.0490676743274180, 0.9987954562051724}, // k=132 - {-0.0613207363022085, 0.9981181129001492}, // k=133 - {-0.0735645635996673, 0.9972904566786902}, // k=134 - {-0.0857973123444398, 0.9963126121827780}, // k=135 - {-0.0980171403295606, 0.9951847266721969}, // k=136 - {-0.1102222072938831, 0.9939069700023561}, // k=137 - {-0.1224106751992162, 0.9924795345987100}, // k=138 - {-0.1345807085071261, 0.9909026354277800}, // k=139 - {-0.1467304744553616, 0.9891765099647810}, // k=140 - {-0.1588581433338613, 0.9873014181578584}, // k=141 - {-0.1709618887603012, 0.9852776423889412}, // k=142 - {-0.1830398879551409, 0.9831054874312163}, // k=143 - {-0.1950903220161282, 0.9807852804032304}, // k=144 - {-0.2071113761922184, 0.9783173707196277}, // k=145 - {-0.2191012401568697, 0.9757021300385286}, // k=146 - {-0.2310581082806711, 0.9729399522055602}, // k=147 - {-0.2429801799032639, 0.9700312531945440}, // k=148 - {-0.2548656596045145, 0.9669764710448521}, // k=149 - {-0.2667127574748983, 0.9637760657954398}, // k=150 - {-0.2785196893850529, 0.9604305194155659}, // k=151 - {-0.2902846772544622, 0.9569403357322089}, // k=152 - {-0.3020059493192281, 0.9533060403541939}, // k=153 - {-0.3136817403988914, 0.9495281805930367}, // k=154 - {-0.3253102921622629, 0.9456073253805214}, // k=155 - {-0.3368898533922199, 0.9415440651830208}, // k=156 - {-0.3484186802494344, 0.9373390119125750}, // k=157 - {-0.3598950365349882, 0.9329927988347388}, // k=158 - {-0.3713171939518375, 0.9285060804732156}, // k=159 - {-0.3826834323650897, 0.9238795325112867}, // k=160 - {-0.3939920400610480, 0.9191138516900578}, // k=161 - {-0.4052413140049897, 0.9142097557035307}, // k=162 - {-0.4164295600976370, 0.9091679830905225}, // k=163 - {-0.4275550934302819, 0.9039892931234434}, // k=164 - {-0.4386162385385274, 0.8986744656939539}, // k=165 - {-0.4496113296546067, 0.8932243011955152}, // k=166 - {-0.4605387109582401, 
0.8876396204028539}, // k=167 - {-0.4713967368259977, 0.8819212643483550}, // k=168 - {-0.4821837720791227, 0.8760700941954066}, // k=169 - {-0.4928981922297840, 0.8700869911087115}, // k=170 - {-0.5035383837257175, 0.8639728561215868}, // k=171 - {-0.5141027441932217, 0.8577286100002721}, // k=172 - {-0.5245896826784687, 0.8513551931052652}, // k=173 - {-0.5349976198870970, 0.8448535652497072}, // k=174 - {-0.5453249884220462, 0.8382247055548382}, // k=175 - {-0.5555702330196020, 0.8314696123025455}, // k=176 - {-0.5657318107836132, 0.8245893027850252}, // k=177 - {-0.5758081914178453, 0.8175848131515837}, // k=178 - {-0.5857978574564389, 0.8104571982525948}, // k=179 - {-0.5956993044924334, 0.8032075314806449}, // k=180 - {-0.6055110414043254, 0.7958369046088836}, // k=181 - {-0.6152315905806267, 0.7883464276266063}, // k=182 - {-0.6248594881423862, 0.7807372285720946}, // k=183 - {-0.6343932841636454, 0.7730104533627371}, // k=184 - {-0.6438315428897913, 0.7651672656224591}, // k=185 - {-0.6531728429537765, 0.7572088465064847}, // k=186 - {-0.6624157775901719, 0.7491363945234593}, // k=187 - {-0.6715589548470184, 0.7409511253549590}, // k=188 - {-0.6806009977954530, 0.7326542716724128}, // k=189 - {-0.6895405447370669, 0.7242470829514669}, // k=190 - {-0.6983762494089728, 0.7157308252838187}, // k=191 - {-0.7071067811865475, 0.7071067811865476}, // k=192 - {-0.7157308252838186, 0.6983762494089729}, // k=193 - {-0.7242470829514668, 0.6895405447370671}, // k=194 - {-0.7326542716724127, 0.6806009977954532}, // k=195 - {-0.7409511253549589, 0.6715589548470186}, // k=196 - {-0.7491363945234591, 0.6624157775901720}, // k=197 - {-0.7572088465064846, 0.6531728429537766}, // k=198 - {-0.7651672656224590, 0.6438315428897914}, // k=199 - {-0.7730104533627370, 0.6343932841636455}, // k=200 - {-0.7807372285720945, 0.6248594881423863}, // k=201 - {-0.7883464276266062, 0.6152315905806269}, // k=202 - {-0.7958369046088835, 0.6055110414043257}, // k=203 - {-0.8032075314806448, 
0.5956993044924335}, // k=204 - {-0.8104571982525947, 0.5857978574564390}, // k=205 - {-0.8175848131515836, 0.5758081914178454}, // k=206 - {-0.8245893027850251, 0.5657318107836135}, // k=207 - {-0.8314696123025453, 0.5555702330196022}, // k=208 - {-0.8382247055548381, 0.5453249884220464}, // k=209 - {-0.8448535652497071, 0.5349976198870972}, // k=210 - {-0.8513551931052652, 0.5245896826784689}, // k=211 - {-0.8577286100002720, 0.5141027441932218}, // k=212 - {-0.8639728561215867, 0.5035383837257177}, // k=213 - {-0.8700869911087113, 0.4928981922297841}, // k=214 - {-0.8760700941954065, 0.4821837720791229}, // k=215 - {-0.8819212643483549, 0.4713967368259979}, // k=216 - {-0.8876396204028538, 0.4605387109582402}, // k=217 - {-0.8932243011955152, 0.4496113296546069}, // k=218 - {-0.8986744656939539, 0.4386162385385275}, // k=219 - {-0.9039892931234433, 0.4275550934302820}, // k=220 - {-0.9091679830905224, 0.4164295600976372}, // k=221 - {-0.9142097557035307, 0.4052413140049899}, // k=222 - {-0.9191138516900578, 0.3939920400610482}, // k=223 - {-0.9238795325112867, 0.3826834323650899}, // k=224 - {-0.9285060804732155, 0.3713171939518377}, // k=225 - {-0.9329927988347388, 0.3598950365349883}, // k=226 - {-0.9373390119125748, 0.3484186802494348}, // k=227 - {-0.9415440651830207, 0.3368898533922203}, // k=228 - {-0.9456073253805212, 0.3253102921622633}, // k=229 - {-0.9495281805930367, 0.3136817403988914}, // k=230 - {-0.9533060403541939, 0.3020059493192280}, // k=231 - {-0.9569403357322088, 0.2902846772544624}, // k=232 - {-0.9604305194155658, 0.2785196893850532}, // k=233 - {-0.9637760657954398, 0.2667127574748985}, // k=234 - {-0.9669764710448521, 0.2548656596045147}, // k=235 - {-0.9700312531945440, 0.2429801799032641}, // k=236 - {-0.9729399522055601, 0.2310581082806713}, // k=237 - {-0.9757021300385285, 0.2191012401568700}, // k=238 - {-0.9783173707196275, 0.2071113761922188}, // k=239 - {-0.9807852804032304, 0.1950903220161286}, // k=240 - {-0.9831054874312163, 
0.1830398879551409}, // k=241 - {-0.9852776423889412, 0.1709618887603012}, // k=242 - {-0.9873014181578584, 0.1588581433338615}, // k=243 - {-0.9891765099647810, 0.1467304744553618}, // k=244 - {-0.9909026354277800, 0.1345807085071263}, // k=245 - {-0.9924795345987100, 0.1224106751992163}, // k=246 - {-0.9939069700023561, 0.1102222072938832}, // k=247 - {-0.9951847266721968, 0.0980171403295608}, // k=248 - {-0.9963126121827780, 0.0857973123444402}, // k=249 - {-0.9972904566786902, 0.0735645635996677}, // k=250 - {-0.9981181129001492, 0.0613207363022085}, // k=251 - {-0.9987954562051724, 0.0490676743274180}, // k=252 - {-0.9993223845883495, 0.0368072229413588}, // k=253 - {-0.9996988186962042, 0.0245412285229123}, // k=254 - {-0.9999247018391445, 0.0122715382857200}, // k=255 - {-1.0000000000000000, 0.0000000000000001}, // k=256 - {-0.9999247018391445, -0.0122715382857198}, // k=257 - {-0.9996988186962042, -0.0245412285229121}, // k=258 - {-0.9993223845883495, -0.0368072229413586}, // k=259 - {-0.9987954562051724, -0.0490676743274177}, // k=260 - {-0.9981181129001492, -0.0613207363022082}, // k=261 - {-0.9972904566786902, -0.0735645635996675}, // k=262 - {-0.9963126121827780, -0.0857973123444399}, // k=263 - {-0.9951847266721969, -0.0980171403295606}, // k=264 - {-0.9939069700023561, -0.1102222072938830}, // k=265 - {-0.9924795345987100, -0.1224106751992161}, // k=266 - {-0.9909026354277800, -0.1345807085071261}, // k=267 - {-0.9891765099647810, -0.1467304744553616}, // k=268 - {-0.9873014181578584, -0.1588581433338612}, // k=269 - {-0.9852776423889413, -0.1709618887603010}, // k=270 - {-0.9831054874312164, -0.1830398879551406}, // k=271 - {-0.9807852804032304, -0.1950903220161284}, // k=272 - {-0.9783173707196277, -0.2071113761922186}, // k=273 - {-0.9757021300385286, -0.2191012401568698}, // k=274 - {-0.9729399522055602, -0.2310581082806711}, // k=275 - {-0.9700312531945440, -0.2429801799032638}, // k=276 - {-0.9669764710448522, -0.2548656596045145}, // k=277 - 
{-0.9637760657954400, -0.2667127574748983}, // k=278 - {-0.9604305194155659, -0.2785196893850529}, // k=279 - {-0.9569403357322089, -0.2902846772544621}, // k=280 - {-0.9533060403541940, -0.3020059493192278}, // k=281 - {-0.9495281805930368, -0.3136817403988912}, // k=282 - {-0.9456073253805213, -0.3253102921622630}, // k=283 - {-0.9415440651830208, -0.3368898533922201}, // k=284 - {-0.9373390119125750, -0.3484186802494346}, // k=285 - {-0.9329927988347390, -0.3598950365349881}, // k=286 - {-0.9285060804732156, -0.3713171939518374}, // k=287 - {-0.9238795325112868, -0.3826834323650897}, // k=288 - {-0.9191138516900578, -0.3939920400610479}, // k=289 - {-0.9142097557035307, -0.4052413140049897}, // k=290 - {-0.9091679830905225, -0.4164295600976369}, // k=291 - {-0.9039892931234434, -0.4275550934302818}, // k=292 - {-0.8986744656939540, -0.4386162385385273}, // k=293 - {-0.8932243011955153, -0.4496113296546067}, // k=294 - {-0.8876396204028539, -0.4605387109582401}, // k=295 - {-0.8819212643483550, -0.4713967368259976}, // k=296 - {-0.8760700941954066, -0.4821837720791227}, // k=297 - {-0.8700869911087115, -0.4928981922297839}, // k=298 - {-0.8639728561215868, -0.5035383837257175}, // k=299 - {-0.8577286100002721, -0.5141027441932216}, // k=300 - {-0.8513551931052653, -0.5245896826784687}, // k=301 - {-0.8448535652497072, -0.5349976198870969}, // k=302 - {-0.8382247055548382, -0.5453249884220461}, // k=303 - {-0.8314696123025455, -0.5555702330196020}, // k=304 - {-0.8245893027850253, -0.5657318107836132}, // k=305 - {-0.8175848131515837, -0.5758081914178453}, // k=306 - {-0.8104571982525948, -0.5857978574564389}, // k=307 - {-0.8032075314806449, -0.5956993044924332}, // k=308 - {-0.7958369046088836, -0.6055110414043254}, // k=309 - {-0.7883464276266063, -0.6152315905806267}, // k=310 - {-0.7807372285720946, -0.6248594881423862}, // k=311 - {-0.7730104533627371, -0.6343932841636453}, // k=312 - {-0.7651672656224591, -0.6438315428897913}, // k=313 - 
{-0.7572088465064848, -0.6531728429537765}, // k=314 - {-0.7491363945234593, -0.6624157775901718}, // k=315 - {-0.7409511253549591, -0.6715589548470184}, // k=316 - {-0.7326542716724128, -0.6806009977954530}, // k=317 - {-0.7242470829514670, -0.6895405447370668}, // k=318 - {-0.7157308252838187, -0.6983762494089728}, // k=319 - {-0.7071067811865477, -0.7071067811865475}, // k=320 - {-0.6983762494089730, -0.7157308252838185}, // k=321 - {-0.6895405447370671, -0.7242470829514668}, // k=322 - {-0.6806009977954532, -0.7326542716724126}, // k=323 - {-0.6715589548470187, -0.7409511253549589}, // k=324 - {-0.6624157775901720, -0.7491363945234590}, // k=325 - {-0.6531728429537771, -0.7572088465064842}, // k=326 - {-0.6438315428897915, -0.7651672656224590}, // k=327 - {-0.6343932841636459, -0.7730104533627367}, // k=328 - {-0.6248594881423865, -0.7807372285720944}, // k=329 - {-0.6152315905806273, -0.7883464276266059}, // k=330 - {-0.6055110414043257, -0.7958369046088835}, // k=331 - {-0.5956993044924331, -0.8032075314806451}, // k=332 - {-0.5857978574564391, -0.8104571982525947}, // k=333 - {-0.5758081914178452, -0.8175848131515838}, // k=334 - {-0.5657318107836135, -0.8245893027850251}, // k=335 - {-0.5555702330196022, -0.8314696123025452}, // k=336 - {-0.5453249884220468, -0.8382247055548379}, // k=337 - {-0.5349976198870973, -0.8448535652497070}, // k=338 - {-0.5245896826784694, -0.8513551931052649}, // k=339 - {-0.5141027441932218, -0.8577286100002720}, // k=340 - {-0.5035383837257180, -0.8639728561215865}, // k=341 - {-0.4928981922297842, -0.8700869911087113}, // k=342 - {-0.4821837720791226, -0.8760700941954067}, // k=343 - {-0.4713967368259979, -0.8819212643483549}, // k=344 - {-0.4605387109582399, -0.8876396204028540}, // k=345 - {-0.4496113296546069, -0.8932243011955152}, // k=346 - {-0.4386162385385276, -0.8986744656939538}, // k=347 - {-0.4275550934302825, -0.9039892931234431}, // k=348 - {-0.4164295600976372, -0.9091679830905224}, // k=349 - 
{-0.4052413140049904, -0.9142097557035305}, // k=350 - {-0.3939920400610482, -0.9191138516900577}, // k=351 - {-0.3826834323650903, -0.9238795325112865}, // k=352 - {-0.3713171939518378, -0.9285060804732155}, // k=353 - {-0.3598950365349879, -0.9329927988347390}, // k=354 - {-0.3484186802494348, -0.9373390119125748}, // k=355 - {-0.3368898533922199, -0.9415440651830208}, // k=356 - {-0.3253102921622633, -0.9456073253805212}, // k=357 - {-0.3136817403988915, -0.9495281805930367}, // k=358 - {-0.3020059493192285, -0.9533060403541938}, // k=359 - {-0.2902846772544624, -0.9569403357322088}, // k=360 - {-0.2785196893850536, -0.9604305194155657}, // k=361 - {-0.2667127574748985, -0.9637760657954398}, // k=362 - {-0.2548656596045143, -0.9669764710448522}, // k=363 - {-0.2429801799032641, -0.9700312531945440}, // k=364 - {-0.2310581082806709, -0.9729399522055602}, // k=365 - {-0.2191012401568701, -0.9757021300385285}, // k=366 - {-0.2071113761922185, -0.9783173707196277}, // k=367 - {-0.1950903220161287, -0.9807852804032303}, // k=368 - {-0.1830398879551410, -0.9831054874312163}, // k=369 - {-0.1709618887603017, -0.9852776423889411}, // k=370 - {-0.1588581433338615, -0.9873014181578583}, // k=371 - {-0.1467304744553623, -0.9891765099647809}, // k=372 - {-0.1345807085071264, -0.9909026354277800}, // k=373 - {-0.1224106751992160, -0.9924795345987101}, // k=374 - {-0.1102222072938833, -0.9939069700023561}, // k=375 - {-0.0980171403295605, -0.9951847266721969}, // k=376 - {-0.0857973123444402, -0.9963126121827780}, // k=377 - {-0.0735645635996674, -0.9972904566786902}, // k=378 - {-0.0613207363022090, -0.9981181129001492}, // k=379 - {-0.0490676743274180, -0.9987954562051724}, // k=380 - {-0.0368072229413593, -0.9993223845883494}, // k=381 - {-0.0245412285229124, -0.9996988186962042}, // k=382 - {-0.0122715382857205, -0.9999247018391445}, // k=383 - {-0.0000000000000002, -1.0000000000000000}, // k=384 - {0.0122715382857201, -0.9999247018391445}, // k=385 - {0.0245412285229120, 
-0.9996988186962042}, // k=386 - {0.0368072229413590, -0.9993223845883495}, // k=387 - {0.0490676743274177, -0.9987954562051724}, // k=388 - {0.0613207363022086, -0.9981181129001492}, // k=389 - {0.0735645635996670, -0.9972904566786902}, // k=390 - {0.0857973123444399, -0.9963126121827780}, // k=391 - {0.0980171403295601, -0.9951847266721969}, // k=392 - {0.1102222072938829, -0.9939069700023561}, // k=393 - {0.1224106751992156, -0.9924795345987101}, // k=394 - {0.1345807085071260, -0.9909026354277800}, // k=395 - {0.1467304744553619, -0.9891765099647809}, // k=396 - {0.1588581433338612, -0.9873014181578584}, // k=397 - {0.1709618887603013, -0.9852776423889412}, // k=398 - {0.1830398879551406, -0.9831054874312164}, // k=399 - {0.1950903220161283, -0.9807852804032304}, // k=400 - {0.2071113761922181, -0.9783173707196278}, // k=401 - {0.2191012401568697, -0.9757021300385286}, // k=402 - {0.2310581082806706, -0.9729399522055603}, // k=403 - {0.2429801799032638, -0.9700312531945440}, // k=404 - {0.2548656596045140, -0.9669764710448523}, // k=405 - {0.2667127574748982, -0.9637760657954400}, // k=406 - {0.2785196893850533, -0.9604305194155658}, // k=407 - {0.2902846772544621, -0.9569403357322089}, // k=408 - {0.3020059493192281, -0.9533060403541939}, // k=409 - {0.3136817403988911, -0.9495281805930368}, // k=410 - {0.3253102921622629, -0.9456073253805213}, // k=411 - {0.3368898533922196, -0.9415440651830209}, // k=412 - {0.3484186802494345, -0.9373390119125750}, // k=413 - {0.3598950365349876, -0.9329927988347391}, // k=414 - {0.3713171939518374, -0.9285060804732156}, // k=415 - {0.3826834323650900, -0.9238795325112866}, // k=416 - {0.3939920400610479, -0.9191138516900579}, // k=417 - {0.4052413140049900, -0.9142097557035306}, // k=418 - {0.4164295600976369, -0.9091679830905225}, // k=419 - {0.4275550934302821, -0.9039892931234433}, // k=420 - {0.4386162385385273, -0.8986744656939540}, // k=421 - {0.4496113296546066, -0.8932243011955153}, // k=422 - {0.4605387109582396, 
-0.8876396204028542}, // k=423 - {0.4713967368259976, -0.8819212643483550}, // k=424 - {0.4821837720791222, -0.8760700941954069}, // k=425 - {0.4928981922297839, -0.8700869911087115}, // k=426 - {0.5035383837257178, -0.8639728561215866}, // k=427 - {0.5141027441932216, -0.8577286100002722}, // k=428 - {0.5245896826784691, -0.8513551931052651}, // k=429 - {0.5349976198870969, -0.8448535652497072}, // k=430 - {0.5453249884220465, -0.8382247055548380}, // k=431 - {0.5555702330196018, -0.8314696123025455}, // k=432 - {0.5657318107836131, -0.8245893027850253}, // k=433 - {0.5758081914178449, -0.8175848131515840}, // k=434 - {0.5857978574564388, -0.8104571982525949}, // k=435 - {0.5956993044924329, -0.8032075314806453}, // k=436 - {0.6055110414043253, -0.7958369046088837}, // k=437 - {0.6152315905806270, -0.7883464276266061}, // k=438 - {0.6248594881423861, -0.7807372285720946}, // k=439 - {0.6343932841636456, -0.7730104533627369}, // k=440 - {0.6438315428897912, -0.7651672656224592}, // k=441 - {0.6531728429537768, -0.7572088465064846}, // k=442 - {0.6624157775901715, -0.7491363945234596}, // k=443 - {0.6715589548470183, -0.7409511253549591}, // k=444 - {0.6806009977954527, -0.7326542716724131}, // k=445 - {0.6895405447370668, -0.7242470829514670}, // k=446 - {0.6983762494089724, -0.7157308252838190}, // k=447 - {0.7071067811865474, -0.7071067811865477}, // k=448 - {0.7157308252838188, -0.6983762494089727}, // k=449 - {0.7242470829514667, -0.6895405447370672}, // k=450 - {0.7326542716724129, -0.6806009977954530}, // k=451 - {0.7409511253549589, -0.6715589548470187}, // k=452 - {0.7491363945234594, -0.6624157775901718}, // k=453 - {0.7572088465064842, -0.6531728429537771}, // k=454 - {0.7651672656224588, -0.6438315428897915}, // k=455 - {0.7730104533627367, -0.6343932841636459}, // k=456 - {0.7807372285720944, -0.6248594881423865}, // k=457 - {0.7883464276266059, -0.6152315905806274}, // k=458 - {0.7958369046088833, -0.6055110414043257}, // k=459 - {0.8032075314806451, 
-0.5956993044924332}, // k=460 - {0.8104571982525947, -0.5857978574564391}, // k=461 - {0.8175848131515837, -0.5758081914178452}, // k=462 - {0.8245893027850251, -0.5657318107836136}, // k=463 - {0.8314696123025452, -0.5555702330196022}, // k=464 - {0.8382247055548377, -0.5453249884220468}, // k=465 - {0.8448535652497070, -0.5349976198870973}, // k=466 - {0.8513551931052649, -0.5245896826784694}, // k=467 - {0.8577286100002720, -0.5141027441932219}, // k=468 - {0.8639728561215864, -0.5035383837257181}, // k=469 - {0.8700869911087113, -0.4928981922297843}, // k=470 - {0.8760700941954067, -0.4821837720791226}, // k=471 - {0.8819212643483548, -0.4713967368259979}, // k=472 - {0.8876396204028539, -0.4605387109582399}, // k=473 - {0.8932243011955151, -0.4496113296546070}, // k=474 - {0.8986744656939538, -0.4386162385385277}, // k=475 - {0.9039892931234431, -0.4275550934302825}, // k=476 - {0.9091679830905224, -0.4164295600976373}, // k=477 - {0.9142097557035305, -0.4052413140049904}, // k=478 - {0.9191138516900577, -0.3939920400610483}, // k=479 - {0.9238795325112865, -0.3826834323650904}, // k=480 - {0.9285060804732155, -0.3713171939518378}, // k=481 - {0.9329927988347390, -0.3598950365349880}, // k=482 - {0.9373390119125748, -0.3484186802494349}, // k=483 - {0.9415440651830208, -0.3368898533922200}, // k=484 - {0.9456073253805212, -0.3253102921622634}, // k=485 - {0.9495281805930367, -0.3136817403988915}, // k=486 - {0.9533060403541936, -0.3020059493192286}, // k=487 - {0.9569403357322088, -0.2902846772544625}, // k=488 - {0.9604305194155657, -0.2785196893850537}, // k=489 - {0.9637760657954398, -0.2667127574748986}, // k=490 - {0.9669764710448522, -0.2548656596045144}, // k=491 - {0.9700312531945440, -0.2429801799032642}, // k=492 - {0.9729399522055602, -0.2310581082806710}, // k=493 - {0.9757021300385285, -0.2191012401568702}, // k=494 - {0.9783173707196277, -0.2071113761922185}, // k=495 - {0.9807852804032303, -0.1950903220161287}, // k=496 - {0.9831054874312163, 
-0.1830398879551410}, // k=497 - {0.9852776423889411, -0.1709618887603018}, // k=498 - {0.9873014181578583, -0.1588581433338616}, // k=499 - {0.9891765099647809, -0.1467304744553624}, // k=500 - {0.9909026354277800, -0.1345807085071264}, // k=501 - {0.9924795345987100, -0.1224106751992160}, // k=502 - {0.9939069700023561, -0.1102222072938834}, // k=503 - {0.9951847266721969, -0.0980171403295605}, // k=504 - {0.9963126121827780, -0.0857973123444403}, // k=505 - {0.9972904566786902, -0.0735645635996674}, // k=506 - {0.9981181129001492, -0.0613207363022091}, // k=507 - {0.9987954562051724, -0.0490676743274181}, // k=508 - {0.9993223845883494, -0.0368072229413594}, // k=509 - {0.9996988186962042, -0.0245412285229124}, // k=510 - {0.9999247018391445, -0.0122715382857206}, // k=511 -}; From 932f156bfc045f80f28b2dcf72b086b45656e386 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 09:41:22 -0700 Subject: [PATCH 24/29] Fixup: cleanup and run GPU fft on its own stream --- src/mesh/parallel/shiftedmetric.cxx | 90 +++++++++++++++-------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 759c5aaa0a..1b92313117 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -72,7 +72,6 @@ void ShiftedMetric::cachePhases() { toAlignedPhs = Tensor(mesh.LocalNx, mesh.LocalNy, nmodes); // To/From field aligned phases - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_ALL")) { int ix = i.x(); int iy = i.y(); @@ -111,7 +110,6 @@ void ShiftedMetric::cachePhases() { // Parallel slice phases -- note we don't shift in the boundaries/guards for (auto& slice : parallel_slice_phases) { - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { int ix = i.x(); @@ -173,7 +171,6 @@ Field3D ShiftedMetric::shiftZ(const 
Field3D& f, const Tensor& phs, Field3D result{emptyFrom(f).setDirectionY(y_direction_out)}; - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D(toString(region))) { shiftZ(&f(i, 0), &phs(i.x(), i.y(), 0), &result(i, 0)); } @@ -204,8 +201,7 @@ FieldPerp ShiftedMetric::shiftZ(const FieldPerp& f, const Tensor& phs, return result; } -void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* out, - int num_batches) const { +void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* out) const { #if BOUT_HAS_UMPIRE // TODO: This static keyword is a hotfix and should be removed in // future iterations. It is here because otherwise many allocations @@ -217,7 +213,7 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou #endif // Take forward FFT - rfft(in, mesh.LocalNz * num_batches, &cmplx[0]); + rfft(in, mesh.LocalNz, &cmplx[0]); // Following is an algorithm approach to write a = a*b where a and b are // vectors of dcomplex. 
@@ -246,13 +242,14 @@ __device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { // Block-level cooperative FFT // Multiple threads cooperate on each FFT using shared memory template -__global__ void -fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ out, - const double2** __restrict__ blocks_phs, const int Nz_runtime, - const int nmodes, const int batches, const int nblocks) { +__global__ void fft_block_cooperative(const BoutReal** __restrict__ in, + BoutReal** __restrict__ out, + const double2** __restrict__ blocks_phs, + const int batches, const int nblocks) { constexpr int LOG2_NZ = __builtin_ctz(NZ); constexpr double INV_NZ = 1.0 / (double)NZ; + constexpr int NMODES = (NZ / 2) + 1; // Shared memory for FFTS_PER_BLOCK FFTs // Each FFT needs NZ complex values @@ -261,15 +258,15 @@ fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ // Select twiddles based on size const double2* twiddles; if constexpr (NZ == 16) { - twiddles = c_twiddle_fwd_16; + twiddles = c_twiddle_16; } else if constexpr (NZ == 64) { - twiddles = c_twiddle_fwd_64; + twiddles = c_twiddle_64; } else if constexpr (NZ == 128) { - twiddles = c_twiddle_fwd_128; + twiddles = c_twiddle_128; } else if constexpr (NZ == 256) { - twiddles = c_twiddle_fwd_256; + twiddles = c_twiddle_256; } else if constexpr (NZ == 512) { - twiddles = c_twiddle_fwd_512; + twiddles = c_twiddle_512; } else { static_assert(NZ == 16 || NZ == 64 || NZ == 128 || NZ == 256 || NZ == 512, "Unsupported NZ"); @@ -340,18 +337,18 @@ fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ } // ===== APPLY PHASE SHIFT ===== - for (int k = tid; k < nmodes; k += threads_per_fft) { - const double2 ph = phs[batch * nmodes + k]; + for (int k = tid; k < NMODES; k += threads_per_fft) { + const double2 ph = phs[batch * NMODES + k]; const double real = shared_fft[fft_id_in_block][k].x; const double imag = shared_fft[fft_id_in_block][k].y; 
shared_fft[fft_id_in_block][k].x = real * ph.x - imag * ph.y; shared_fft[fft_id_in_block][k].y = real * ph.y + imag * ph.x; } - for (int k = tid + nmodes; k < NZ; k += threads_per_fft) { - if (k >= nmodes) { + for (int k = tid + NMODES; k < NZ; k += threads_per_fft) { + if (k >= NMODES) { const int kk = NZ - k; - const double2 tmp = phs[batch * nmodes + kk]; + const double2 tmp = phs[batch * NMODES + kk]; const double real = shared_fft[fft_id_in_block][k].x; const double imag = shared_fft[fft_id_in_block][k].y; shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; @@ -418,18 +415,15 @@ fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ } // Launcher for block-level cooperative FFT -static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, - const double2** phs, int nblocks, int batches, +static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, + const double2** phs, int nblocks, int nbatches, cudaStream_t stream = 0) { - int Nz = mesh.LocalNz; - int nmodes = Nz / 2 + 1; - if ((Nz & (Nz - 1)) != 0) { fprintf(stderr, "Error: Nz=%d must be power of 2\n", Nz); return; } - const int total_ffts = nblocks * batches; + const int total_ffts = nblocks * nbatches; if (Nz == 16) { constexpr int FFTS_PER_BLOCK = 16; @@ -439,7 +433,7 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<16, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, nbatches, nblocks); } else if (Nz == 64) { constexpr int FFTS_PER_BLOCK = 4; constexpr int THREADS_PER_FFT = 64; // Use 64 threads per FFT @@ -448,7 +442,7 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<64, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, 
nbatches, nblocks); } else if (Nz == 128) { constexpr int FFTS_PER_BLOCK = 2; @@ -458,7 +452,7 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<128, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, nbatches, nblocks); } else if (Nz == 256) { constexpr int FFTS_PER_BLOCK = 1; @@ -468,7 +462,7 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid(total_ffts); fft_block_cooperative<256, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, nbatches, nblocks); } else if (Nz == 512) { constexpr int FFTS_PER_BLOCK = 1; @@ -478,10 +472,9 @@ static void shiftZ_block_fft(Mesh& mesh, const BoutReal** in, BoutReal** out, dim3 grid(total_ffts); fft_block_cooperative<512, FFTS_PER_BLOCK> - <<>>(in, out, phs, Nz, nmodes, batches, nblocks); + <<>>(in, out, phs, nbatches, nblocks); } else { - fprintf(stderr, "Unsupported Nz=%d for block FFT\n", Nz); - throw std::runtime_error("Unsupported Nz for block FFT"); + throw std::runtime_error("Unsupported Nz " + std::to_string(Nz) + " for block FFT"); } cudaError_t err = cudaGetLastError(); @@ -516,16 +509,30 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { f_slice.allocate(); #if BOUT_HAS_CUDA + static struct StreamRAII { + cudaStream_t stream = 0; + StreamRAII() { + if (cudaStreamCreate(&stream) != cudaSuccess) { + throw BoutException("Failed to create CUDA stream"); + } + } + + cudaStream_t get() const { return stream; } + + void synchronize() const { cudaStreamSynchronize(stream); } + + ~StreamRAII() { cudaStreamDestroy(stream); } + } stream; size_t block_idx = 0; - int num_batches = + int nbatches = region.getBlocks().cbegin()->second.ind - region.getBlocks().cbegin()->first.ind; for (auto block = region.getBlocks().cbegin(), end = region.getBlocks().cend(); block < end; ++block) { auto idx_s = 
block->first; auto idx_e = block->second; - int inner_batches = idx_e.ind - idx_s.ind; - if (inner_batches != num_batches) { + int inner_nbatches = idx_e.ind - idx_s.ind; + if (inner_nbatches != nbatches) { throw BoutException( "Non-uniform number of batches in ShiftedMetric::calcParallelSlices"); } @@ -540,12 +547,11 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { block_idx++; } - shiftZ_block_fft(mesh, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, - num_batches, 0); + shiftZ_block_fft(mesh.LocalNz, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, + nbatches, stream.get()); - cudaDeviceSynchronize(); + stream.synchronize(); #else - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { const int ix = i.x(); const int iy = i.y(); @@ -553,8 +559,6 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { shiftZ(&(f(ix, iy_offset, 0)), &(phase.phase_shift(ix, iy, 0)), &(f_slice(ix, iy_offset, 0))); } - //std::cout << "ShiftedMetric::shiftZ " << __FILE__ << " :" << __LINE__ - // << " count = " << count << " each size " << mesh.LocalNz << "\n"; #endif } } @@ -572,7 +576,6 @@ ShiftedMetric::shiftZ(const Field3D& f, Matrix> f_fft(mesh.LocalNx, mesh.LocalNy); f_fft = Array(nmodes); - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_ALL")) { int ix = i.x(); int iy = i.y(); @@ -587,7 +590,6 @@ ShiftedMetric::shiftZ(const Field3D& f, current_result.allocate(); current_result.setLocation(f.getLocation()); - // std::cout << "[TRACE] BOUT_FOR " << __FILE__ << ":" << __LINE__ << "\n"; BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { // Deep copy the FFT'd field int ix = i.x(); From b6c738c449f9ff32d853687c89169a995bc3a4a2 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 10:25:31 -0700 Subject: [PATCH 25/29] Fixup: remove comments, avoid temp for inverse --- src/mesh/parallel/shiftedmetric.cxx | 36 
+++++++++++++---------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 1b92313117..8a71e082d6 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -227,7 +227,6 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou irfft(&cmplx[0], mesh.LocalNz, out); // Reverse FFT } -/* NEW CODE */ // Bit-reversal __device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { unsigned int result = 0; @@ -364,16 +363,15 @@ __global__ void fft_block_cooperative(const BoutReal** __restrict__ in, } __syncthreads(); - // Bit-reverse for inverse - __shared__ double2 temp_fft[FFTS_PER_BLOCK][NZ]; - for (int i = tid; i < NZ; i += threads_per_fft) { + // Bit-reverse with standard swap to avoid temp array + // This is tricky but saves memory + for (int i = tid; i < NZ / 2; i += threads_per_fft) { const unsigned int rev_i = bit_reverse(i, LOG2_NZ); - temp_fft[fft_id_in_block][rev_i] = shared_fft[fft_id_in_block][i]; - } - __syncthreads(); - - for (int i = tid; i < NZ; i += threads_per_fft) { - shared_fft[fft_id_in_block][i] = temp_fft[fft_id_in_block][i]; + if (i < rev_i) { // Only swap once per pair + double2 temp = shared_fft[fft_id_in_block][i]; + shared_fft[fft_id_in_block][i] = shared_fft[fft_id_in_block][rev_i]; + shared_fft[fft_id_in_block][rev_i] = temp; + } } __syncthreads(); @@ -427,18 +425,18 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, if (Nz == 16) { constexpr int FFTS_PER_BLOCK = 16; - constexpr int THREADS_PER_FFT = 16; // Use 64 threads per FFT + constexpr int THREADS_PER_FFT = 16; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 16 x 16 = 256 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<16, FFTS_PER_BLOCK> <<>>(in, out, phs, nbatches, nblocks); } else if 
(Nz == 64) { constexpr int FFTS_PER_BLOCK = 4; - constexpr int THREADS_PER_FFT = 64; // Use 64 threads per FFT + constexpr int THREADS_PER_FFT = 64; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 64 x 4 = 256 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<64, FFTS_PER_BLOCK> @@ -448,7 +446,7 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, constexpr int FFTS_PER_BLOCK = 2; constexpr int THREADS_PER_FFT = 128; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 128 x 2 = 256 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid((total_ffts + FFTS_PER_BLOCK - 1) / FFTS_PER_BLOCK); fft_block_cooperative<128, FFTS_PER_BLOCK> @@ -458,7 +456,7 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, constexpr int FFTS_PER_BLOCK = 1; constexpr int THREADS_PER_FFT = 256; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 256 x 1 = 256 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid(total_ffts); fft_block_cooperative<256, FFTS_PER_BLOCK> @@ -466,9 +464,9 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, } else if (Nz == 512) { constexpr int FFTS_PER_BLOCK = 1; - constexpr int THREADS_PER_FFT = 512; // 512 threads per FFT + constexpr int THREADS_PER_FFT = 512; - dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); // 512 x 1 = 512 threads + dim3 block(THREADS_PER_FFT, FFTS_PER_BLOCK); dim3 grid(total_ffts); fft_block_cooperative<512, FFTS_PER_BLOCK> @@ -483,8 +481,6 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, } } -/* END NEWER CODE */ - void ShiftedMetric::calcParallelSlices(Field3D& f) { if (f.getDirectionY() == YDirectionType::Aligned) { // Cannot calculate parallel slices for field-aligned fields, so return without From 11ebfcd4e9874c7b181f2406e37227dbf4038fd2 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 
2025 10:43:12 -0700 Subject: [PATCH 26/29] Fixup: preprocessor guards, better variable naming --- src/mesh/parallel/shiftedmetric.cxx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 8a71e082d6..c05550e348 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -227,6 +227,7 @@ void ShiftedMetric::shiftZ(const BoutReal* in, const dcomplex* phs, BoutReal* ou irfft(&cmplx[0], mesh.LocalNz, out); // Reverse FFT } +#if BOUT_HAS_CUDA // Bit-reversal __device__ inline unsigned int bit_reverse(unsigned int x, unsigned int log2n) { unsigned int result = 0; @@ -244,7 +245,7 @@ template __global__ void fft_block_cooperative(const BoutReal** __restrict__ in, BoutReal** __restrict__ out, const double2** __restrict__ blocks_phs, - const int batches, const int nblocks) { + const int nbatches, const int nblocks) { constexpr int LOG2_NZ = __builtin_ctz(NZ); constexpr double INV_NZ = 1.0 / (double)NZ; @@ -276,11 +277,11 @@ __global__ void fft_block_cooperative(const BoutReal** __restrict__ in, threadIdx.y; // Which FFT this thread works on (0 to FFTS_PER_BLOCK-1) const int global_fft_id = blockIdx.x * FFTS_PER_BLOCK + fft_id_in_block; - if (global_fft_id >= nblocks * batches) + if (global_fft_id >= nblocks * nbatches) return; - const int block = global_fft_id / batches; - const int batch = global_fft_id % batches; + const int block = global_fft_id / nbatches; + const int batch = global_fft_id % nbatches; const double* __restrict__ in_line = in[block] + batch * NZ; double* __restrict__ out_line = out[block] + batch * NZ; @@ -480,6 +481,7 @@ static void shiftZ_block_fft(const int Nz, const BoutReal** in, BoutReal** out, throw std::runtime_error(std::string("Block FFT failed: ") + cudaGetErrorString(err)); } } +#endif void ShiftedMetric::calcParallelSlices(Field3D& f) { if (f.getDirectionY() == YDirectionType::Aligned) { @@ -490,8 +492,8 
@@ void ShiftedMetric::calcParallelSlices(Field3D& f) { f.splitParallelSlices(); +#if BOUT_HAS_CUDA auto& region = mesh.getRegion2D("RGN_NOY"); - static size_t nblocks = region.getBlocks().size(); if (nblocks != region.getBlocks().size()) { throw BoutException("Number of blocks changed in ShiftedMetric::calcParallelSlices"); @@ -499,6 +501,7 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { static Array blocks_in(nblocks); static Array blocks_out(nblocks); static Array phs_in(nblocks); +#endif for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset); From cf5a88291036bc20a5922a5fb1563dc8221623f5 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 11:15:49 -0700 Subject: [PATCH 27/29] Fixup: saner split with BOUT_HAS_CUDA --- src/mesh/parallel/shiftedmetric.cxx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index c05550e348..981034c1ee 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -501,13 +501,11 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { static Array blocks_in(nblocks); static Array blocks_out(nblocks); static Array phs_in(nblocks); -#endif for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset); f_slice.allocate(); -#if BOUT_HAS_CUDA static struct StreamRAII { cudaStream_t stream = 0; StreamRAII() { @@ -550,7 +548,12 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { nbatches, stream.get()); stream.synchronize(); + } #else + for (const auto& phase : parallel_slice_phases) { + auto& f_slice = f.ynext(phase.y_offset); + f_slice.allocate(); + BOUT_FOR(i, mesh.getRegion2D("RGN_NOY")) { const int ix = i.x(); const int iy = i.y(); @@ -558,8 +561,8 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { shiftZ(&(f(ix, iy_offset, 0)), &(phase.phase_shift(ix, iy, 0)), &(f_slice(ix, iy_offset, 0))); } -#endif } 
+#endif } std::vector From 9f42dcbf7ffadecde2ab51c1dc308549910d1b61 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Fri, 24 Oct 2025 18:28:25 -0700 Subject: [PATCH 28/29] Fixup: remove redundant conditional --- src/mesh/parallel/shiftedmetric.cxx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 981034c1ee..45f2988561 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -346,14 +346,12 @@ __global__ void fft_block_cooperative(const BoutReal** __restrict__ in, } for (int k = tid + NMODES; k < NZ; k += threads_per_fft) { - if (k >= NMODES) { - const int kk = NZ - k; - const double2 tmp = phs[batch * NMODES + kk]; - const double real = shared_fft[fft_id_in_block][k].x; - const double imag = shared_fft[fft_id_in_block][k].y; - shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; - shared_fft[fft_id_in_block][k].y = -real * tmp.y + imag * tmp.x; - } + const int kk = NZ - k; + const double2 tmp = phs[batch * NMODES + kk]; + const double real = shared_fft[fft_id_in_block][k].x; + const double imag = shared_fft[fft_id_in_block][k].y; + shared_fft[fft_id_in_block][k].x = real * tmp.x + imag * tmp.y; + shared_fft[fft_id_in_block][k].y = -real * tmp.y + imag * tmp.x; } __syncthreads(); From 35e6f423859c4f6b5554f005e31576f18cc23c56 Mon Sep 17 00:00:00 2001 From: Giorgis Georgakoudis Date: Mon, 24 Nov 2025 12:31:35 -0800 Subject: [PATCH 29/29] Use streams to reduce synchronization overhead --- include/bout/bout_types.hxx | 7 ++-- include/bout/field2d.hxx | 6 +-- include/bout/field3d.hxx | 40 ++++++++++---------- include/bout/fieldops.hxx | 57 +++++++++++++++++++++++++---- include/bout/rajalib.hxx | 14 ------- src/mesh/parallel/shiftedmetric.cxx | 46 +++++++++++++---------- 6 files changed, 103 insertions(+), 67 deletions(-) diff --git a/include/bout/bout_types.hxx b/include/bout/bout_types.hxx index 
c725c281d3..03bc4dcee4 100644 --- a/include/bout/bout_types.hxx +++ b/include/bout/bout_types.hxx @@ -2,7 +2,7 @@ * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu * * Contact Ben Dudson, bd512@york.ac.uk - * + * * This file is part of BOUT++. * * BOUT++ is free software: you can redistribute it and/or modify @@ -145,8 +145,9 @@ struct Constant { T val; struct View { T v; - View(T v) : v(v) {} - __host__ __device__ T operator()(int) const { return v; } + cudaStream_t stream = 0; + View(T v) : v(v) {} + __host__ __device__ T operator()(int) const { return v; } }; operator View() const { return {val}; } }; diff --git a/include/bout/field2d.hxx b/include/bout/field2d.hxx index da8de551ad..b452df7fd6 100644 --- a/include/bout/field2d.hxx +++ b/include/bout/field2d.hxx @@ -273,10 +273,10 @@ public: #define FIELD2D_OP_EQUALS(OP_SYM) \ template \ std::enable_if_t || is_expr_constant_v, Field2D&> \ - operator OP_SYM##=(R rhs) { \ + operator OP_SYM## = (R rhs) { \ if (data.unique()) { \ - auto BE = (*this)OP_SYM rhs; \ - BE.evaluate(&data[0]); \ + auto expr = (*this)OP_SYM rhs; \ + expr.evaluate(&data[0]); \ } else { \ (*this) = (*this)OP_SYM rhs; \ } \ diff --git a/include/bout/field3d.hxx b/include/bout/field3d.hxx index 62b299bc48..9dc064d6e6 100644 --- a/include/bout/field3d.hxx +++ b/include/bout/field3d.hxx @@ -2,7 +2,7 @@ * Copyright 2010 B.D.Dudson, S.Farley, M.V.Umansky, X.Q.Xu * * Contact: Ben Dudson, bd512@york.ac.uk - * + * * This file is part of BOUT++. 
* * BOUT++ is free software: you can redistribute it and/or modify @@ -355,7 +355,7 @@ public: * Direct access to the underlying data array * * If CHECK > 2 then bounds checking is performed - * + * * If CHECK <= 2 then no checks are performed, to * allow inlining and optimisation of inner loops */ @@ -473,19 +473,19 @@ public: ///@} -#define FIELD3D_OP_EQUALS(OP_SYM) \ - template \ - std::enable_if_t || is_expr_field2d_v \ - || is_expr_constant_v, \ - Field3D&> operator OP_SYM##=(const R & rhs) { \ - if (data.unique()) { \ - clearParallelSlices(); \ - auto Expr = (*this)OP_SYM rhs; \ - Expr.evaluate(&data[0]); \ - } else { \ - (*this) = (*this)OP_SYM rhs; \ - } \ - return *this; \ +#define FIELD3D_OP_EQUALS(OP_SYM) \ + template \ + std::enable_if_t< \ + is_expr_field3d_v || is_expr_field2d_v || is_expr_constant_v, Field3D&> \ + operator OP_SYM## = (const R& rhs) { \ + if (data.unique()) { \ + clearParallelSlices(); \ + auto expr = (*this)OP_SYM rhs; \ + expr.evaluate(&data[0]); \ + } else { \ + (*this) = (*this)OP_SYM rhs; \ + } \ + return *this; \ } FIELD3D_OP_EQUALS(+) @@ -565,8 +565,8 @@ FieldPerp operator/(const Field3D& lhs, const FieldPerp& rhs); #define FIELD3D_FIELD3D_FIELD3D_OP(OP_SYM, OP_TYPE) \ template && is_expr_field3d_v>> \ - BinaryExpr operator OP_SYM(const L & lhs, \ - const R & rhs) { \ + BinaryExpr operator OP_SYM(const L& lhs, \ + const R& rhs) { \ auto regionID = \ lhs.getMesh()->getCommonRegion(lhs.getRegionID(), rhs.getRegionID()); \ return BinaryExpr{ \ @@ -590,7 +590,7 @@ FIELD3D_FIELD3D_FIELD3D_OP(/, Div) template \ std::enable_if_t && is_expr_field2d_v, \ BinaryExpr> \ - operator OP_SYM(const L & lhs, const R & rhs) { \ + operator OP_SYM(const L& lhs, const R& rhs) { \ auto regionID = lhs.getRegionID(); \ int mesh_nz = lhs.getMesh()->LocalNz; \ return BinaryExpr{ \ @@ -613,7 +613,7 @@ FIELD3D_FIELD3D_FIELD2D_OP(/, Div) template \ std::enable_if_t && is_expr_constant_v, \ BinaryExpr, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, 
R rhs) { \ + operator OP_SYM(const L& lhs, R rhs) { \ auto regionID = lhs.getRegionID(); \ return BinaryExpr, bout::op::OP_TYPE>{ \ static_cast(lhs), \ @@ -635,7 +635,7 @@ FIELD3D_FIELD3D_BOUTREAL_OP(/, Div) template \ std::enable_if_t && is_expr_field3d_v, \ BinaryExpr, R, bout::op::OP_TYPE>> \ - operator OP_SYM(const L & lhs, const R & rhs) { \ + operator OP_SYM(const L& lhs, const R& rhs) { \ auto regionID = rhs.getRegionID(); \ return BinaryExpr, R, bout::op::OP_TYPE>{ \ static_cast::View>(lhs), \ diff --git a/include/bout/fieldops.hxx b/include/bout/fieldops.hxx index 48d104e3ea..114793ebc8 100644 --- a/include/bout/fieldops.hxx +++ b/include/bout/fieldops.hxx @@ -130,6 +130,40 @@ __global__ void __launch_bounds__(THREADS) evaluatorExpr(BoutReal* out, const Ex inline std::unordered_map> regionIndicesCache; +struct StreamsRAII { + std::vector streams; + + cudaStream_t get() { + cudaStream_t stream = 0; + + if (streams.empty()) { + if (cudaStreamCreate(&stream) != cudaSuccess) { + throw BoutException("Failed to create CUDA stream"); + } + } else { + stream = streams.back(); + streams.pop_back(); + } + + return stream; + } + + void put(cudaStream_t stream) { streams.push_back(stream); } + + ~StreamsRAII() { + for (auto& stream : streams) { + cudaStreamDestroy(stream); + } + } + + StreamsRAII() = default; + StreamsRAII(const StreamsRAII&) = delete; + StreamsRAII(StreamsRAII&&) = delete; + StreamsRAII& operator=(const StreamsRAII&) = delete; + StreamsRAII& operator=(StreamsRAII&&) = delete; +}; +inline struct StreamsRAII streams; + template struct BinaryExpr { typename L::View lhs; @@ -220,16 +254,23 @@ struct BinaryExpr { operator View() const { return View{lhs, rhs, &indices[0], indices.size(), f}; } void evaluate(BoutReal* data) const { +#if 1 + cudaStream_t stream = streams.get(); int blocks = (size() + THREADS - 1) / THREADS; - evaluatorExpr<<>>(&data[0], static_cast(*this)); - cudaDeviceSynchronize(); + evaluatorExpr<<>>(&data[0], static_cast(*this)); + 
cudaStreamSynchronize(stream); + streams.put(stream); +#endif + +#if 0 // OpenMP impl. - //int e = size(); + int e = size(); //#pragma omp parallel for - //for (int i = 0; i < e; ++i) { - // int idx = regionIdx(i); - // data[idx] = operator()(idx); // single‐pass fusion - //} + for (int i = 0; i < e; ++i) { + int idx = regionIdx(i); + data[idx] = operator()(idx); // single‐pass fusion + } +#endif } Mesh* getMesh() const { return mesh; } @@ -238,4 +279,4 @@ struct BinaryExpr { std::optional getRegionID() const { return regionID; }; }; -#endif // BOUT_EXPRESSION_HX \ No newline at end of file +#endif // BOUT_FIELDSOPS_HXX diff --git a/include/bout/rajalib.hxx b/include/bout/rajalib.hxx index d61a58e0d8..20929304b5 100644 --- a/include/bout/rajalib.hxx +++ b/include/bout/rajalib.hxx @@ -139,20 +139,6 @@ private: #define BOUT_FOR_RAJA(index, region, ...) \ RajaForAll(region) << [ =, ##__VA_ARGS__ ] RAJA_DEVICE(int index) mutable -// NEW STUFF - -template -__global__ void evaluator(BoutReal *out, Expr &expr) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = tid; i < expr.size(); i += stride) { - out[expr.regionIdx(i)] = expr(expr.regionIdx(i)); // single‐pass fusion - } -} - -// END OF NEW STUFF - - #else // BOUT_HAS_RAJA #warning RAJA not enabled. BOUT_FOR_RAJA falling back to BOUT_FOR. 
diff --git a/src/mesh/parallel/shiftedmetric.cxx b/src/mesh/parallel/shiftedmetric.cxx index 45f2988561..40085eaf92 100644 --- a/src/mesh/parallel/shiftedmetric.cxx +++ b/src/mesh/parallel/shiftedmetric.cxx @@ -496,32 +496,39 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { if (nblocks != region.getBlocks().size()) { throw BoutException("Number of blocks changed in ShiftedMetric::calcParallelSlices"); } - static Array blocks_in(nblocks); - static Array blocks_out(nblocks); - static Array phs_in(nblocks); - for (const auto& phase : parallel_slice_phases) { - auto& f_slice = f.ynext(phase.y_offset); - f_slice.allocate(); - - static struct StreamRAII { - cudaStream_t stream = 0; - StreamRAII() { - if (cudaStreamCreate(&stream) != cudaSuccess) { - throw BoutException("Failed to create CUDA stream"); - } + static struct StreamRAII { + cudaStream_t stream = 0; + StreamRAII() { + if (cudaStreamCreate(&stream) != cudaSuccess) { + throw BoutException("Failed to create CUDA stream"); } + } + + cudaStream_t get() const { return stream; } + + void synchronize() const { cudaStreamSynchronize(stream); } - cudaStream_t get() const { return stream; } + ~StreamRAII() { cudaStreamDestroy(stream); } + } stream; - void synchronize() const { cudaStreamSynchronize(stream); } + // Vector of Arrays for each phase. 
+ std::vector> blocks_in_phase; + std::vector> blocks_out_phase; + std::vector> phs_in_phase; + + for (const auto& phase : parallel_slice_phases) { + auto& f_slice = f.ynext(phase.y_offset); + f_slice.allocate(); - ~StreamRAII() { cudaStreamDestroy(stream); } - } stream; size_t block_idx = 0; int nbatches = region.getBlocks().cbegin()->second.ind - region.getBlocks().cbegin()->first.ind; + Array& blocks_in = blocks_in_phase.emplace_back(nblocks); + Array& blocks_out = blocks_out_phase.emplace_back(nblocks); + Array& phs_in = phs_in_phase.emplace_back(nblocks); + for (auto block = region.getBlocks().cbegin(), end = region.getBlocks().cend(); block < end; ++block) { auto idx_s = block->first; @@ -544,9 +551,10 @@ void ShiftedMetric::calcParallelSlices(Field3D& f) { shiftZ_block_fft(mesh.LocalNz, &blocks_in[0], &blocks_out[0], &phs_in[0], nblocks, nbatches, stream.get()); - - stream.synchronize(); } + + // Synchronize to ensure all shifts are complete. + stream.synchronize(); #else for (const auto& phase : parallel_slice_phases) { auto& f_slice = f.ynext(phase.y_offset);