From 4b5ae7d3654a81e93c14a5874cc0972c2872c2cf Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Sun, 10 May 2026 18:03:49 +0200
Subject: [PATCH] Tentative support for avx512f extensions to 256-bit
 registers

In addition to providing a few missing instructions (e.g. abs on
int64_t), this mostly changes the mask representation from a vector
register to a scalar one, hence the large diff.
---
 .github/workflows/linux.yml                    |   4 +
 include/xsimd/arch/xsimd_avx512f_256.hpp       | 721 ++++++++++++++++++
 include/xsimd/arch/xsimd_isa.hpp               |   1 +
 include/xsimd/config/xsimd_arch.hpp            |   2 +-
 .../xsimd/config/xsimd_cpu_features_x86.hpp    |   2 +
 .../xsimd/types/xsimd_avx512f_register.hpp     |  19 +
 6 files changed, 748 insertions(+), 1 deletion(-)
 create mode 100644 include/xsimd/arch/xsimd_avx512f_256.hpp

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 7eec735d8..581c56606 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -32,6 +32,7 @@ jobs:
           - { compiler: 'clang', version: '18', flags: 'avx512' }
           - { compiler: 'clang', version: '18', flags: 'avx_128' }
           - { compiler: 'clang', version: '18', flags: 'avx2_128' }
+          - { compiler: 'clang', version: '18', flags: 'avx512f_256' }
     steps:
       - name: Setup compiler
         if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -94,6 +95,9 @@
         if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
         fi
+        if [[ '${{ matrix.sys.flags }}' == 'avx512f_256' ]]; then
+          CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512 -DXSIMD_DEFAULT_ARCH=avx512f_256"
+        fi
         if [[ '${{ matrix.sys.flags }}' == 'avx512pf' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knl"
         fi
diff --git a/include/xsimd/arch/xsimd_avx512f_256.hpp b/include/xsimd/arch/xsimd_avx512f_256.hpp
new file mode 100644
index 000000000..0d0ba28a7
--- /dev/null
+++ b/include/xsimd/arch/xsimd_avx512f_256.hpp
@@ -0,0 +1,721 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ * Copyright (c) Marco Barbone                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ***************************************************************************/
+
+#ifndef XSIMD_AVX512F_256_HPP
+#define XSIMD_AVX512F_256_HPP
+
+#include <limits>
+
+#include "../types/xsimd_avx512f_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        using namespace types;
+
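+        // NOTE: under avx512f_256, batch_bool is represented as a scalar
+        // AVX-512 opmask (e.g. __mmask16) instead of the vector register the
+        // AVX/AVX2 kernels use. A minimal, illustrative sketch of what that
+        // means for users (assuming an 8-lane float batch):
+        //
+        //     batch_bool<float, avx512f_256> b = ...; // b.data is a bit mask
+        //     uint64_t m = b.mask();                  // bits 0..7 map to lanes
+        //     bool lane3 = (m >> 3) & 1;              // test a single lane
+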
+        namespace detail
+        {
+            template <class A, class T, int Cmp>
+            XSIMD_INLINE batch_bool<T, A> compare_int_avx512f_256(batch<T, A> const& self, batch<T, A> const& other) noexcept
+            {
+                using register_type = typename batch_bool<T, A>::register_type;
+                if (std::is_signed<T>::value)
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        // shifting to take sign into account
+                        uint64_t mask_low0 = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x000000FF)) << 24,
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x000000FF)) << 24,
+                                                                   Cmp);
+                        uint64_t mask_low1 = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+                                                                   Cmp);
+                        uint64_t mask_high0 = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+                                                                    Cmp);
+                        uint64_t mask_high1 = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFF000000)),
+                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFF000000)),
+                                                                    Cmp);
+                        uint64_t mask = 0;
+                        for (unsigned i = 0; i < 8; ++i)
+                        {
+                            mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+                            mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+                            mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+                            mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+                        }
+                        return (register_type)mask;
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        // shifting to take sign into account
+                        uint16_t mask_low = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+                                                                  (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+                                                                  Cmp);
+                        uint16_t mask_high = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFFFF0000)),
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFFFF0000)),
+                                                                   Cmp);
+                        return static_cast<register_type>(morton(mask_low, mask_high));
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm256_cmp_epi32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm256_cmp_epi64_mask(self, other, Cmp);
+                    }
+                }
+                else
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        uint64_t mask_low0 = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x000000FF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x000000FF)), Cmp);
+                        uint64_t mask_low1 = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FF00)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FF00)), Cmp);
+                        uint64_t mask_high0 = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x00FF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x00FF0000)), Cmp);
+                        uint64_t mask_high1 = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFF000000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFF000000)), Cmp);
+                        uint64_t mask = 0;
+                        for (unsigned i = 0; i < 8; ++i)
+                        {
+                            mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+                            mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+                            mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+                            mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+                        }
+                        return (register_type)mask;
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        uint16_t mask_low = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FFFF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FFFF)), Cmp);
+                        uint16_t mask_high = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFFFF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFFFF0000)), Cmp);
+                        return static_cast<register_type>(morton(mask_low, mask_high));
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
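+        // NOTE: plain AVX512F offers no 8/16-bit integer mask compares (those
+        // require AVX512BW), so compare_int_avx512f_256 emulates them with
+        // 32-bit compares on the sub-lanes and then re-interleaves the partial
+        // masks. For sizeof(T) == 2, bit i of mask_low and mask_high end up at
+        // bits 2*i and 2*i + 1 of the result, i.e. morton(mask_low, mask_high);
+        // the sizeof(T) == 1 loop performs the same interleave on four masks.
+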
+                        return (register_type)_mm256_cmp_epu32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm256_cmp_epu64_mask(self, other, Cmp);
+                    }
+                }
+            }
+        }
+
+        // load mask
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            constexpr auto size = batch_bool<T, A>::size;
+            constexpr auto iter = size / 4;
+            static_assert((size % 4) == 0, "incorrect size of bool batch");
+            register_type mask = 0;
+            for (std::size_t i = 0; i < iter; ++i)
+            {
+                unsigned char block = detail::tobitset((unsigned char*)mem + i * 4);
+                mask |= (register_type(block) << (i * 4));
+            }
+            return mask;
+        }
+
+        // from bool
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            return select(self, batch<T, A>(1), batch<T, A>(0));
+        }
+
+        // from_mask
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512f_256>) noexcept
+        {
+            return static_cast<typename batch_bool<T, A>::register_type>(mask);
+        }
+
+        // mask
+        template <class A, class T>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            return self.data;
+        }
+
+        // batch_bool_cast
+        template <class A, class T_out, class T_in>
+        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512f_256>) noexcept
+        {
+            return self.data;
+        }
+
+        // set
+        template <class A, class T, class... Values>
+        XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512f_256>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
+            using register_type = typename batch_bool<T, A>::register_type;
+            register_type r = 0;
+            unsigned shift = 0;
+            (void)std::initializer_list<register_type> { (r |= register_type(values ? 1 : 0) << (shift++))... };
+            return r;
+        }
+
+        // store
+        template <class A, class T>
+        XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            constexpr auto size = batch_bool<T, A>::size;
+            for (std::size_t i = 0; i < size; ++i)
+                mem[i] = self.data & (register_type(1) << i);
+        }
+
+        // abs
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> abs(batch<int64_t, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_abs_epi64(self);
+        }
+
+        // load masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx512f_256>) noexcept
+        {
+            constexpr auto imm_mask = mask.mask();
+            return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem);
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx512f_256>) noexcept
+        {
+            constexpr auto imm_mask = mask.mask();
+            return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem);
+        }
+
+        // store masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx512f_256>) noexcept
+        {
+            _mm256_mask_store_epi32(mem, mask.mask(), src);
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx512f_256>) noexcept
+        {
+            _mm256_mask_store_epi32(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx512f_256>) noexcept
+        {
+            _mm256_mask_store_epi64(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx512f_256>) noexcept
+        {
+            _mm256_mask_store_epi64(mem, mask.mask(), src);
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx512f_256>) noexcept
+        {
+            _mm256_mask_store_ps(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx512f_256>) noexcept
+        {
+            _mm256_mask_store_pd(mem, mask.mask(), src);
+        }
+
+        // max
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> max(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_max_epi64(self, other);
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> max(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_max_epu64(self, other);
+        }
+
+        // min
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> min(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_min_epi64(self, other);
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> min(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_min_epu64(self, other);
+        }
+
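+        // NOTE: load_masked uses _mm256_mask_loadu_epi32 with a zeroed source,
+        // so disabled lanes read as 0 and their memory is never accessed
+        // (AVX-512 masked loads suppress faults on masked-off lanes). An
+        // illustrative sketch, assuming a 7-element int32_t tail at p:
+        //
+        //     batch_bool_constant<int32_t, avx512f_256, 1, 1, 1, 1, 1, 1, 1, 0> m;
+        //     auto v = load_masked(p, m, convert<int32_t> {}, unaligned_mode {},
+        //                          avx512f_256 {});
+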
+        // swizzle (dynamic version)
+        template <class A>
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_permutexvar_ps(mask, self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_permutexvar_pd(mask, self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_permutexvar_epi64(mask, self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f_256>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f_256 {}));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_permutexvar_epi32(mask, self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f_256>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f_256 {}));
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<T, A> mask, requires_arch<avx512f_256>) noexcept
+        {
+            return swizzle(batch<T, avx2> { self.data }, batch<T, avx2> { mask.data }, avx2 {}).data;
+        }
+        template <class A, class T, detail::enable_sized_integral_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint16_t, A> const& mask, requires_arch<avx512f_256> req) noexcept
+        {
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask, req));
+        }
+        template <class A, class T, detail::enable_sized_integral_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx512f_256> req) noexcept
+        {
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask, req));
+        }
+
+        // swizzle (constant version)
+        template <class A, class T, uint8_t... Vs, detail::enable_sized_integral_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint8_t, A, Vs...> const& mask, requires_arch<avx512f_256>) noexcept
+        {
+            return swizzle(self, mask, fma3<avx2> {});
+        }
+        template <class A, class T, uint16_t... Vs, detail::enable_sized_integral_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint16_t, A, Vs...> const& mask, requires_arch<avx512f_256>) noexcept
+        {
+            return swizzle(self, mask, fma3<avx2> {});
+        }
+        template <class A, class T, uint32_t... Vs, detail::enable_sized_integral_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint32_t, A, Vs...> const& mask, requires_arch<avx512f_256>) noexcept
+        {
+            return swizzle(self, mask, fma3<avx2> {});
+        }
+
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx512f_256>) noexcept
+        {
+            constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
+            return _mm256_permutex_pd(self, mask);
+        }
+        template <class A, class T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_integral_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx512f_256>) noexcept
+        {
+            constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
+            return _mm256_permutex_epi64(self, mask);
+        }
+
+        // insert
+        template <class A, size_t I>
+        XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f_256>) noexcept
+        {
+            int32_t tmp = bit_cast<int32_t>(val);
+            return _mm256_castsi256_ps(_mm256_mask_set1_epi32(_mm256_castps_si256(self), __mmask8(1 << (I & 7)), tmp));
+        }
+
+        template <class A, size_t I>
+        XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<avx512f_256>) noexcept
+        {
+            int64_t tmp = bit_cast<int64_t>(val);
+            return _mm256_castsi256_pd(_mm256_mask_set1_epi64(_mm256_castpd_si256(self), __mmask8(1 << (I & 3)), tmp));
+        }
+
+        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx512f_256>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_mask_set1_epi32(self, __mmask8(1 << (I & 7)), val);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_mask_set1_epi64(self, __mmask8(1 << (I & 3)), val);
+            }
+            else
+            {
+                return insert(self, val, pos, common {});
+            }
+        }
+
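+        // NOTE: insert is a masked broadcast: _mm256_mask_set1_epi32 splats the
+        // scalar into every lane but only writes the lane selected by the
+        // one-hot writemask 1 << I. E.g. inserting into lane 2 uses writemask
+        // 0b00000100 and leaves every other lane of self untouched.
+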
+        // isnan
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm256_cmp_ps_mask(self, self, _CMP_UNORD_Q);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm256_cmp_pd_mask(self, self, _CMP_UNORD_Q);
+        }
+
+        // rotl
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_rolv_epi32(self, other);
+            }
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_rolv_epi64(self, other);
+            }
+            return rotl(self, other, avx2 {});
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, int32_t other, requires_arch<avx512f_256>) noexcept
+        {
+            return rotl(self, batch<T, A>(other), A {});
+        }
+        template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(count < bits, "Count must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_rol_epi32(self, count);
+            }
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_rol_epi64(self, count);
+            }
+
+            return rotl<count>(self, avx2 {});
+        }
+
+        // rotr
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_rorv_epi32(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_rorv_epi64(self, other);
+                }
+            }
+            return rotr(self, other, avx2 {});
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, int32_t other, requires_arch<avx512f_256>) noexcept
+        {
+            return rotr(self, batch<T, A>(other), A {});
+        }
+
+        template <size_t count, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(count < bits, "Count must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_ror_epi32(self, count);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_ror_epi64(self, count);
+                }
+            }
+            return rotr<count>(self, avx2 {});
+        }
+
+        // all
+        template <class A, class T>
+        XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return self.data == register_type(-1) >> (sizeof(register_type) * 4);
+        }
+
+        // any
+        template <class A, class T>
+        XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return self.data != register_type(0);
+        }
+
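+        // NOTE: avx512f_256 reuses the 512-bit mask register types (see
+        // get_bool_simd_register in xsimd_avx512f_register.hpp), so only the
+        // low half of the mask bits maps to lanes. In all() above,
+        // register_type(-1) >> (sizeof(register_type) * 4) is the "every lane
+        // set" pattern, e.g. 8 float lanes in a 16-bit mask: 0xFFFF >> 8 == 0xFF.
+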
+        // eq
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm256_cmp_ps_mask(self, other, _CMP_EQ_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm256_cmp_pd_mask(self, other, _CMP_EQ_OQ);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return detail::compare_int_avx512f_256<A, T, _MM_CMPINT_EQ>(self, other);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(~self.data ^ other.data);
+        }
+
+        // neq
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm256_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm256_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return ~(self == other);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data ^ other.data);
+        }
+
+        // gt
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm256_cmp_ps_mask(self, other, _CMP_GT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm256_cmp_pd_mask(self, other, _CMP_GT_OQ);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return detail::compare_int_avx512f_256<A, T, _MM_CMPINT_GT>(self, other);
+        }
+
+        // ge
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm256_cmp_ps_mask(self, other, _CMP_GE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm256_cmp_pd_mask(self, other, _CMP_GE_OQ);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return detail::compare_int_avx512f_256<A, T, _MM_CMPINT_GE>(self, other);
+        }
+
+        // lt
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm256_cmp_ps_mask(self, other, _CMP_LT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm256_cmp_pd_mask(self, other, _CMP_LT_OQ);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return detail::compare_int_avx512f_256<A, T, _MM_CMPINT_LT>(self, other);
+        }
+
+        // le
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm256_cmp_ps_mask(self, other, _CMP_LE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm256_cmp_pd_mask(self, other, _CMP_LE_OQ);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            return detail::compare_int_avx512f_256<A, T, _MM_CMPINT_LE>(self, other);
+        }
+
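+        // NOTE: the floating-point predicates are ordered and quiet
+        // (_CMP_*_OQ), so any comparison involving a NaN yields false. neq is
+        // the exception: it uses the unordered _CMP_NEQ_UQ so that neq(NaN, x)
+        // is true, consistent with !eq(NaN, x).
+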
+        // select
+        template <class A>
+        XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_mask_blend_ps(cond, false_br, true_br);
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_mask_blend_pd(cond, false_br, true_br);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f_256>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                batch_bool<T, avx2> batch_cond = batch_bool<T, avx2>::from_mask(cond.mask());
+                return _mm256_blendv_epi8(false_br, true_br, batch_cond);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                batch_bool<T, avx2> batch_cond = batch_bool<T, avx2>::from_mask(cond.mask());
+                return _mm256_blendv_epi8(false_br, true_br, batch_cond);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_mask_blend_epi32(cond, false_br, true_br);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_mask_blend_epi64(cond, false_br, true_br);
+            }
+        }
+        template <class A, class T, bool... Values>
+        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f_256>) noexcept
+        {
+            return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f_256 {});
+        }
+
+        // reciprocal
+        template <class A>
+        XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self, kernel::requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_rcp14_ps(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> reciprocal(batch<double, A> const& self, kernel::requires_arch<avx512f_256>) noexcept
+        {
+            return _mm256_rcp14_pd(self);
+        }
+
+        // bitwise_and
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data & other.data);
+        }
+
+        // bitwise_andnot
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data & ~other.data);
+        }
+
+        // bitwise_not
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(~self.data);
+        }
+
+        // bitwise_or
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data | other.data);
+        }
+
+        // bitwise_xor
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data ^ other.data);
+        }
+
+        // sadd
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f_256>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                auto mask = other < 0;
+                auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
+                auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
+                return other + select(mask, self_neg_branch, self_pos_branch);
+            }
+            else
+            {
+                const auto diffmax = std::numeric_limits<T>::max() - self;
+                const auto mindiff = min(diffmax, other);
+                return self + mindiff;
+            }
+        }
+
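+        // NOTE: sadd clamps the addend to the remaining headroom instead of
+        // branching. Unsigned example with uint8_t lanes:
+        //     sadd(200, 100) = 200 + min(255 - 200, 100) = 200 + 55 = 255.
+        // The signed path selects the matching clamp bound based on the sign
+        // of other.
+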
+    }
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp
index e4af24da6..237ffca7d 100644
--- a/include/xsimd/arch/xsimd_isa.hpp
+++ b/include/xsimd/arch/xsimd_isa.hpp
@@ -72,6 +72,7 @@
 
 #if XSIMD_WITH_AVX512F
 #include "./xsimd_avx512f.hpp"
+#include "./xsimd_avx512f_256.hpp"
 #endif
 
 #if XSIMD_WITH_AVX512DQ
diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp
index 5c5f9a5fc..b3962a0b2 100644
--- a/include/xsimd/config/xsimd_arch.hpp
+++ b/include/xsimd/config/xsimd_arch.hpp
@@ -163,7 +163,7 @@ namespace xsimd
     using all_x86_architectures = arch_list<
         avx512vnni<avx512vbmi2>, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
-        avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, avx2_128, avx_128, fma4, fma3<sse4_2>,
+        avxvnni, avx512f_256, fma3<avx2>, avx2, fma3<avx>, avx, avx2_128, avx_128, fma4, fma3<sse4_2>,
         sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
 
     using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp
index 3c840c2c5..45fbd959f 100644
--- a/include/xsimd/config/xsimd_cpu_features_x86.hpp
+++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp
@@ -875,6 +875,8 @@ namespace xsimd
         inline bool avx512f() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); }
 
+        inline bool avx512f_256() const noexcept { return avx_enabled() && leaf7().all_bits_set(); }
+
         inline bool avx512dq() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); }
 
         inline bool rdseed() const noexcept { return leaf7().all_bits_set(); }
diff --git a/include/xsimd/types/xsimd_avx512f_register.hpp b/include/xsimd/types/xsimd_avx512f_register.hpp
index 279ae4caa..e30276f26 100644
--- a/include/xsimd/types/xsimd_avx512f_register.hpp
+++ b/include/xsimd/types/xsimd_avx512f_register.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_AVX512F_REGISTER_HPP
 
 #include "./xsimd_common_arch.hpp"
+#include "./xsimd_fma3_avx2_register.hpp"
 
 namespace xsimd
 {
@@ -31,6 +32,18 @@ namespace xsimd
         static constexpr char const* name() noexcept { return "avx512f"; }
     };
 
+    /**
+     * @ingroup architectures
+     *
+     * AVX512F instructions extension for 256 bits registers
+     */
+    struct avx512f_256 : fma3<avx2>
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx512f/256"; }
+    };
+
 #if XSIMD_WITH_AVX512F
 
 #if !XSIMD_WITH_AVX2
@@ -70,6 +83,12 @@ namespace xsimd
         XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
         XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);
 
+        template <class T>
+        struct get_bool_simd_register<T, avx512f_256>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
         XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512f_256, avx2);
     }
 #endif
 }
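
For reference, a minimal sketch of how the new target can be exercised once
the patch is applied (illustrative only; it assumes a toolchain and CPU with
AVX512F support so that xsimd::avx512f_256 is available):

    #include <xsimd/xsimd.hpp>

    int main()
    {
        using arch = xsimd::avx512f_256;
        xsimd::batch<float, arch> a(1.5f), b(2.5f);
        auto lt = a < b;                   // batch_bool backed by a scalar opmask
        bool all_lt = xsimd::all(lt);      // compares against the low-half mask
        auto c = xsimd::select(lt, a, b);  // masked blend, _mm256_mask_blend_ps
        return (all_lt && c.get(0) == 1.5f) ? 0 : 1;
    }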