From 29fc88d55523ddb1b92bdf3d917788d8af0cba3a Mon Sep 17 00:00:00 2001
From: Dan Mattheiss
Date: Sat, 23 May 2026 17:12:13 -0400
Subject: [PATCH 1/3] GH-50026: [C++][Parquet] SIMD-accelerate SBBF probe via branchless autovec

Rewrite BlockSplitBloomFilter::FindHash from a short-circuit early-exit
loop to a branchless OR-accumulator reduction. The early `return false`
blocked compilers from collapsing the 8-lane probe to a horizontal block
test; the reduction autovectorizes to a single SSE/NEON block test on
clang, gcc, and MSVC.

Wire the probe through CpuInfo runtime dispatch, mirroring the existing
level_comparison_avx2 pattern. The shared body in
bloom_filter_block_inc.h is built once at the baseline (SSE on x86, NEON
on aarch64) and once in bloom_filter_avx2.cc compiled with `-mavx2`. The
AVX2 TU spells the reduction in xsimd rather than relying on autovec:
clang lowers the autovec body to a single vptest, but gcc/MSVC emit a
longer horizontal vpor reduction that costs ~20% out-of-L3. xsimd is
guaranteed available under ARROW_HAVE_RUNTIME_AVX2.

A new cross-target diff test calls both probe bodies directly across 20K
random + 200 production-populated blocks per CI run, so neither path can
silently drift. A static_assert ties the 8-lane assumption to
BlockSplitBloomFilter::kBitsSetPerBlock.

On-disk format unchanged. SALT, XXH64, bucket index unchanged.
Bit-identical to the scalar reference.

End-to-end FindHash perf via parquet/benches/bloom_filter_benchmark.cc.
M1 (Apple clang -O3, NEON via autovec, 10 reps, CV<=0.4%):

| Bench                               | upstream/main (scalar)    | simd-sbbf-autovec         | Speedup |
|-------------------------------------|---------------------------|---------------------------|---------|
| BM_FindExistingHash (hit-heavy)     | 3.85 ns/probe (259.6 M/s) | 2.41 ns/probe (415.1 M/s) | 1.60x   |
| BM_FindNonExistingHash (miss-heavy) | 9.04 ns/probe (110.6 M/s) | 2.41 ns/probe (415.4 M/s) | 3.75x   |

x86-64 (gcc 13.3, -O2 -mavx2 via AVX2 dispatch TU, 5 reps, CV<=0.6%):

| Bench                               | upstream/main (scalar)    | simd-sbbf-autovec         | Speedup |
|-------------------------------------|---------------------------|---------------------------|---------|
| BM_FindExistingHash (hit-heavy)     | 8.62 ns/probe (116.0 M/s) | 4.32 ns/probe (231.6 M/s) | 2.00x   |
| BM_FindNonExistingHash (miss-heavy) | 15.29 ns/probe (65.4 M/s) | 4.33 ns/probe (230.8 M/s) | 3.53x   |

The scalar miss path stalls on the data-dependent early-exit (slower
than its own hit path on both archs); the branchless reduction is
constant-time across hit and miss. Miss-heavy is the common case for
Parquet row-group skipping.

Insert/ComputeHash/batch paths unchanged (16 benches within +/-0.6%).
Cache-regime sweep in the PR description.

Insert path uses the same loop shape and follows in a separate PR.
--- cpp/src/parquet/CMakeLists.txt | 10 ++- cpp/src/parquet/bloom_filter.cc | 48 +++++++--- cpp/src/parquet/bloom_filter_avx2.cc | 46 ++++++++++ cpp/src/parquet/bloom_filter_avx2_internal.h | 32 +++++++ cpp/src/parquet/bloom_filter_block_inc.h | 43 +++++++++ cpp/src/parquet/bloom_filter_test.cc | 92 ++++++++++++++++++++ 6 files changed, 258 insertions(+), 13 deletions(-) create mode 100644 cpp/src/parquet/bloom_filter_avx2.cc create mode 100644 cpp/src/parquet/bloom_filter_avx2_internal.h create mode 100644 cpp/src/parquet/bloom_filter_block_inc.h diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 07cf9f9c5031..4a7201a53727 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -195,7 +195,11 @@ set(PARQUET_SRCS if(ARROW_HAVE_RUNTIME_AVX2) # AVX2 is used as a proxy for BMI2. - list(APPEND PARQUET_SRCS level_comparison_avx2.cc level_conversion_bmi2.cc) + list(APPEND + PARQUET_SRCS + level_comparison_avx2.cc + level_conversion_bmi2.cc + bloom_filter_avx2.cc) # We need CMAKE_CXX_FLAGS_RELEASE here to prevent the one-definition-rule # violation with -DCMAKE_BUILD_TYPE=MinSizeRel. CMAKE_CXX_FLAGS_RELEASE # will force inlining as much as possible. @@ -205,8 +209,8 @@ if(ARROW_HAVE_RUNTIME_AVX2) separate_arguments(RELEASE_FLAGS NATIVE_COMMAND "${CMAKE_CXX_FLAGS_RELEASE}") list(APPEND AVX2_FLAGS ${RELEASE_FLAGS}) endif() - set_source_files_properties(level_comparison_avx2.cc PROPERTIES COMPILE_OPTIONS - "${AVX2_FLAGS}") + set_source_files_properties(level_comparison_avx2.cc bloom_filter_avx2.cc + PROPERTIES COMPILE_OPTIONS "${AVX2_FLAGS}") # WARNING: DO NOT BLINDLY COPY THIS CODE FOR OTHER BMI2 USE CASES. # This code is always guarded by runtime dispatch which verifies # BMI2 is present. 
For a very small number of CPUs AVX2 does not diff --git a/cpp/src/parquet/bloom_filter.cc b/cpp/src/parquet/bloom_filter.cc index 577d26fe0078..e76c62c6851d 100644 --- a/cpp/src/parquet/bloom_filter.cc +++ b/cpp/src/parquet/bloom_filter.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include @@ -34,6 +35,35 @@ #include "parquet/thrift_internal.h" #include "parquet/xxhasher.h" +#if defined(ARROW_HAVE_RUNTIME_AVX2) +# include "parquet/bloom_filter_avx2_internal.h" +#endif + +#define PARQUET_IMPL_NAMESPACE standard +#include "parquet/bloom_filter_block_inc.h" +#undef PARQUET_IMPL_NAMESPACE + +#include "arrow/util/dispatch_internal.h" + +namespace parquet::internal { +namespace { + +using ::arrow::internal::DynamicDispatch; + +struct FindHashBlockDynamicFunction { + using FunctionType = decltype(&standard::FindHashBlockImpl); + + static constexpr auto targets() { + return std::array{ + ARROW_DISPATCH_TARGET_NONE(&standard::FindHashBlockImpl) // + ARROW_DISPATCH_TARGET_AVX2(&FindHashBlockAvx2) // + }; + } +}; + +} // namespace +} // namespace parquet::internal + namespace parquet { namespace { @@ -346,20 +376,18 @@ void BlockSplitBloomFilter::WriteTo(ArrowOutputStream* sink) const { } bool BlockSplitBloomFilter::FindHash(uint64_t hash) const { + // Probe kernels in bloom_filter_block_inc.h and bloom_filter_avx2.cc both + // hard-code an 8-lane (256-bit) block. + static_assert(kBitsSetPerBlock == 8, + "SBBF probe kernels assume 8 bits set per 256-bit block"); const uint32_t bucket_index = static_cast(((hash >> 32) * (num_bytes_ / kBytesPerFilterBlock)) >> 32); const uint32_t key = static_cast(hash); const uint32_t* bitset32 = reinterpret_cast(data_->data()); - - for (int i = 0; i < kBitsSetPerBlock; ++i) { - // Calculate mask for key in the given bitset. 
- const uint32_t mask = UINT32_C(0x1) << ((key * SALT[i]) >> 27); - if (ARROW_PREDICT_FALSE(0 == - (bitset32[kBitsSetPerBlock * bucket_index + i] & mask))) { - return false; - } - } - return true; + const uint32_t* block = bitset32 + kBitsSetPerBlock * bucket_index; + static ::arrow::internal::DynamicDispatch + dispatch; + return dispatch(block, SALT, key); } void BlockSplitBloomFilter::InsertHashImpl(uint64_t hash) { diff --git a/cpp/src/parquet/bloom_filter_avx2.cc b/cpp/src/parquet/bloom_filter_avx2.cc new file mode 100644 index 000000000000..4eca1bf4e6b9 --- /dev/null +++ b/cpp/src/parquet/bloom_filter_avx2.cc @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "parquet/bloom_filter_avx2_internal.h" + +namespace parquet::internal { + +// Parquet SBBF probe (256-bit block, 8 SALT-derived bits per probe). Unrelated +// to cpp/src/arrow/acero/bloom_filter_avx2.cc -- Acero's blocked bloom filter +// is a different algorithm (64-bit block, ~4 bits per probe, in-memory only) +// and the two kernels are not interchangeable. See the Parquet spec for the +// SBBF on-disk layout this kernel must match. 
+// +// Spelled in xsimd rather than reusing the autovectorized body in +// bloom_filter_block_inc.h: only clang lowers that body to a single vptest; +// gcc and MSVC emit a longer horizontal vpor reduction. +bool FindHashBlockAvx2(const uint32_t* block, const uint32_t* salt, uint32_t key) { + using batch = xsimd::batch; + const batch mask = batch(uint32_t{1}) + << ((batch(key) * batch::load_unaligned(salt)) >> 27); + const batch miss = xsimd::bitwise_andnot(mask, batch::load_unaligned(block)); + // `miss != 0` (one extra vpcmpeqd) is deliberate: reinterpreting `miss` + // directly as a batch_bool would skip the compare but feed non-canonical + // lane values into batch_bool, which relies on xsimd's AVX2 backend + // lowering none() to a whole-register vptest. That lowering is not part + // of xsimd's documented contract. + return xsimd::none(miss != batch(uint32_t{0})); +} + +} // namespace parquet::internal diff --git a/cpp/src/parquet/bloom_filter_avx2_internal.h b/cpp/src/parquet/bloom_filter_avx2_internal.h new file mode 100644 index 000000000000..9291f5bf5ece --- /dev/null +++ b/cpp/src/parquet/bloom_filter_avx2_internal.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "parquet/platform.h" + +namespace parquet::internal { + +// PARQUET_EXPORT so the symbol is visible from parquet_shared on Windows MinGW +// (default visibility is hidden) -- the cross-target diff test calls this +// directly. +PARQUET_EXPORT bool FindHashBlockAvx2(const uint32_t* block, const uint32_t* salt, + uint32_t key); + +} // namespace parquet::internal diff --git a/cpp/src/parquet/bloom_filter_block_inc.h b/cpp/src/parquet/bloom_filter_block_inc.h new file mode 100644 index 000000000000..c8f161e299dc --- /dev/null +++ b/cpp/src/parquet/bloom_filter_block_inc.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include + +// Used to make sure ODR rule isn't violated. +#ifndef PARQUET_IMPL_NAMESPACE +# error "PARQUET_IMPL_NAMESPACE must be defined" +#endif + +namespace parquet::internal::PARQUET_IMPL_NAMESPACE { + +// Branchless OR-accumulator reduction: the short-circuit `return false` shape +// blocks the compiler from collapsing the 8-lane probe into a single +// horizontal block test. 
This shape autovectorizes to SSE/NEON at the +// baseline; AVX2 has its own xsimd kernel (see bloom_filter_avx2.cc) because +// gcc/MSVC don't lower this reduction to a single vptest. +inline bool FindHashBlockImpl(const uint32_t* block, const uint32_t* salt, uint32_t key) { + constexpr int kBitsSetPerBlock = 8; + uint32_t miss = 0; + for (int i = 0; i < kBitsSetPerBlock; ++i) { + const uint32_t mask = static_cast(1) << ((key * salt[i]) >> 27); + miss |= (~block[i] & mask); + } + return miss == 0; +} + +} // namespace parquet::internal::PARQUET_IMPL_NAMESPACE diff --git a/cpp/src/parquet/bloom_filter_test.cc b/cpp/src/parquet/bloom_filter_test.cc index ff83b9730227..ca84b9b35270 100644 --- a/cpp/src/parquet/bloom_filter_test.cc +++ b/cpp/src/parquet/bloom_filter_test.cc @@ -30,6 +30,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/util/cpu_info.h" #include "parquet/bloom_filter.h" #include "parquet/exception.h" @@ -38,6 +39,16 @@ #include "parquet/types.h" #include "parquet/xxhasher.h" +// Both dispatch targets included directly so the test can exercise the +// un-picked one too -- DynamicDispatch resolves once at static init. +#define PARQUET_IMPL_NAMESPACE standard +#include "parquet/bloom_filter_block_inc.h" +#undef PARQUET_IMPL_NAMESPACE + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +# include "parquet/bloom_filter_avx2_internal.h" +#endif + namespace parquet { namespace test { @@ -434,5 +445,86 @@ TYPED_TEST(TestBatchBloomFilter, Basic) { AssertBufferEqual(*buffer, *batch_insert_buffer); } +// Guards against silent drift between the baseline and AVX2 probe bodies -- +// DynamicDispatch only runs one of them per host. +#if defined(ARROW_HAVE_RUNTIME_AVX2) +namespace { + +// 8-lane block matches BlockSplitBloomFilter's hard-coded shape; declared +// locally so the test doesn't depend on the class's private constants. 
+constexpr int kProbeBlockLanes = 8; + +// Test-only SALT (matches the Parquet SBBF spec values used in +// bloom_filter.h). Kernel-vs-kernel agreement holds for any SALT, so this +// duplication is a contained test-side convenience, not a spec mirror. +alignas(32) constexpr uint32_t kProbeTestSalt[kProbeBlockLanes] = { + 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, + 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U, +}; + +inline void InsertIntoBlock(uint32_t* block, uint32_t key) { + for (int i = 0; i < kProbeBlockLanes; ++i) { + block[i] |= uint32_t{1} << ((key * kProbeTestSalt[i]) >> 27); + } +} + +void AssertKernelsAgree(const uint32_t* block, uint32_t key) { + const bool standard = internal::standard::FindHashBlockImpl(block, kProbeTestSalt, key); + const bool avx2 = internal::FindHashBlockAvx2(block, kProbeTestSalt, key); + ASSERT_EQ(standard, avx2) << "dispatch targets diverged for key=0x" << std::hex << key; +} + +} // namespace + +class BloomFilterProbeKernel : public ::testing::Test { + protected: + void SetUp() override { + if (!::arrow::internal::CpuInfo::GetInstance()->IsSupported( + ::arrow::internal::CpuInfo::AVX2)) { + GTEST_SKIP() << "AVX2 not available at runtime"; + } + } +}; + +// Random-block fuzz: exercises the full bit lattice, catches reduction / +// operand-order bugs that don't depend on realistic fill density. +TEST_F(BloomFilterProbeKernel, AgreeOnRandomBlocks) { + std::mt19937_64 rng(0xC0FFEE); + constexpr int kNumTrials = 20000; + for (int trial = 0; trial < kNumTrials; ++trial) { + alignas(32) uint32_t block[kProbeBlockLanes]; + for (uint32_t& word : block) { + word = static_cast(rng()); + } + AssertKernelsAgree(block, static_cast(rng())); + } +} + +// Production-fill fuzz: blocks populated by the same SALT-derived insert the +// writer uses, then probed with both inserted keys (must match) and fresh +// keys (mostly miss). Catches bugs that only surface on real fill density. 
+TEST_F(BloomFilterProbeKernel, AgreeOnPopulatedBlocks) { + std::mt19937_64 rng(0xBABECAFE); + constexpr int kNumBlocks = 200; + constexpr int kKeysPerBlock = 6; // ~k inserts per 256-bit block, realistic FPP. + for (int b = 0; b < kNumBlocks; ++b) { + alignas(32) uint32_t block[kProbeBlockLanes] = {0}; + std::vector inserted; + inserted.reserve(kKeysPerBlock); + for (int k = 0; k < kKeysPerBlock; ++k) { + const uint32_t key = static_cast(rng()); + InsertIntoBlock(block, key); + inserted.push_back(key); + } + for (uint32_t key : inserted) { + AssertKernelsAgree(block, key); + } + for (int q = 0; q < 50; ++q) { + AssertKernelsAgree(block, static_cast(rng())); + } + } +} +#endif + } // namespace test } // namespace parquet From 5a99d504a6455f608ae25db4ff9eb627a4323e25 Mon Sep 17 00:00:00 2001 From: dmatth1 Date: Sun, 24 May 2026 21:03:45 -0400 Subject: [PATCH 2/3] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- cpp/src/parquet/bloom_filter.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/parquet/bloom_filter.cc b/cpp/src/parquet/bloom_filter.cc index e76c62c6851d..d5238c204b43 100644 --- a/cpp/src/parquet/bloom_filter.cc +++ b/cpp/src/parquet/bloom_filter.cc @@ -56,7 +56,9 @@ struct FindHashBlockDynamicFunction { static constexpr auto targets() { return std::array{ ARROW_DISPATCH_TARGET_NONE(&standard::FindHashBlockImpl) // +#if defined(ARROW_HAVE_RUNTIME_AVX2) ARROW_DISPATCH_TARGET_AVX2(&FindHashBlockAvx2) // +#endif }; } }; From 15912950bfd733c00f4918c6cb37a76f45bd9bc9 Mon Sep 17 00:00:00 2001 From: Dan Mattheiss Date: Fri, 29 May 2026 07:36:32 -0400 Subject: [PATCH 3/3] Address PR feedback Dispatch guard, generic xsimd batch + bitwise_rshift, comment cleanups. 
--- cpp/src/parquet/bloom_filter.cc | 4 +--- cpp/src/parquet/bloom_filter_avx2.cc | 7 ++++--- cpp/src/parquet/bloom_filter_avx2_internal.h | 3 --- cpp/src/parquet/bloom_filter_block_inc.h | 5 ----- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/bloom_filter.cc b/cpp/src/parquet/bloom_filter.cc index d5238c204b43..aa8106c4af12 100644 --- a/cpp/src/parquet/bloom_filter.cc +++ b/cpp/src/parquet/bloom_filter.cc @@ -35,7 +35,7 @@ #include "parquet/thrift_internal.h" #include "parquet/xxhasher.h" -#if defined(ARROW_HAVE_RUNTIME_AVX2) +#if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2) # include "parquet/bloom_filter_avx2_internal.h" #endif @@ -56,9 +56,7 @@ struct FindHashBlockDynamicFunction { static constexpr auto targets() { return std::array{ ARROW_DISPATCH_TARGET_NONE(&standard::FindHashBlockImpl) // -#if defined(ARROW_HAVE_RUNTIME_AVX2) ARROW_DISPATCH_TARGET_AVX2(&FindHashBlockAvx2) // -#endif }; } }; diff --git a/cpp/src/parquet/bloom_filter_avx2.cc b/cpp/src/parquet/bloom_filter_avx2.cc index 4eca1bf4e6b9..fd5e8c2a83e4 100644 --- a/cpp/src/parquet/bloom_filter_avx2.cc +++ b/cpp/src/parquet/bloom_filter_avx2.cc @@ -31,9 +31,10 @@ namespace parquet::internal { // bloom_filter_block_inc.h: only clang lowers that body to a single vptest; // gcc and MSVC emit a longer horizontal vpor reduction. 
bool FindHashBlockAvx2(const uint32_t* block, const uint32_t* salt, uint32_t key) { - using batch = xsimd::batch; - const batch mask = batch(uint32_t{1}) - << ((batch(key) * batch::load_unaligned(salt)) >> 27); + using batch = xsimd::batch; + const batch mask = + batch(uint32_t{1}) + << xsimd::bitwise_rshift<27>(batch(key) * batch::load_unaligned(salt)); const batch miss = xsimd::bitwise_andnot(mask, batch::load_unaligned(block)); // `miss != 0` (one extra vpcmpeqd) is deliberate: reinterpreting `miss` // directly as a batch_bool would skip the compare but feed non-canonical diff --git a/cpp/src/parquet/bloom_filter_avx2_internal.h b/cpp/src/parquet/bloom_filter_avx2_internal.h index 9291f5bf5ece..7966d137ef5f 100644 --- a/cpp/src/parquet/bloom_filter_avx2_internal.h +++ b/cpp/src/parquet/bloom_filter_avx2_internal.h @@ -23,9 +23,6 @@ namespace parquet::internal { -// PARQUET_EXPORT so the symbol is visible from parquet_shared on Windows MinGW -// (default visibility is hidden) -- the cross-target diff test calls this -// directly. PARQUET_EXPORT bool FindHashBlockAvx2(const uint32_t* block, const uint32_t* salt, uint32_t key); diff --git a/cpp/src/parquet/bloom_filter_block_inc.h b/cpp/src/parquet/bloom_filter_block_inc.h index c8f161e299dc..804a57cba95c 100644 --- a/cpp/src/parquet/bloom_filter_block_inc.h +++ b/cpp/src/parquet/bloom_filter_block_inc.h @@ -25,11 +25,6 @@ namespace parquet::internal::PARQUET_IMPL_NAMESPACE { -// Branchless OR-accumulator reduction: the short-circuit `return false` shape -// blocks the compiler from collapsing the 8-lane probe into a single -// horizontal block test. This shape autovectorizes to SSE/NEON at the -// baseline; AVX2 has its own xsimd kernel (see bloom_filter_avx2.cc) because -// gcc/MSVC don't lower this reduction to a single vptest. inline bool FindHashBlockImpl(const uint32_t* block, const uint32_t* salt, uint32_t key) { constexpr int kBitsSetPerBlock = 8; uint32_t miss = 0;