From 35c7d3af6fd31fb9cb5a9f824df24015ab4a2efd Mon Sep 17 00:00:00 2001 From: nez0b Date: Mon, 15 Jun 2026 20:25:26 +0800 Subject: [PATCH 1/2] Optimise getValueOfBits and insertBits with BMI2 PEXT/PDEP (#717) Replace the per-amplitude looped bit gather/scatter in the CPU statevector/ density-matrix kernels with x86 BMI2 PEXT/PDEP, hoisting the loop-invariant masks out of the 2^N loops so each per-amplitude call becomes one instruction: - insertBitsWithMaskedValues sites: compute the position mask once per gate and use _pdep_u64 (order-invariant scatter; unconditionally correct). - getValueOfBits sites: _pext_u64 when the qubits are strictly increasing, falling back to the original scalar loop otherwise (order is preserved). Portability: BMI2 is opt-in via a new QUEST_ENABLE_BMI2 CMake option, OFF by default, so a default build stays portable scalar (no BMI2 in the binary, no SIGILL on pre-BMI2 CPUs). Enabling it wires -mbmi2 through the library; the intrinsics are additionally guarded to x86 host TUs (never CUDA/HIP device code). The scalar fallback is byte-identical, and QUEST_BITWISE_FORCE_SCALAR forces it on a BMI2-capable host. Tests/benchmark: tests/unit/bitwise.cpp asserts the new helpers are bit-identical to the originals over exhaustive-small, randomised, and boundary inputs (bits 31/32/61/62/63 incl. the int64 sign bit); examples/automated adds a cross-platform benchmark that prints timings and which path was compiled in. Bit-identical to the scalar path (verified by unit tests, the QuEST suite for the touched kernels, and amplitude hashes of QFT/random/Grover/VQE circuits). 
Closes #717 --- CMakeLists.txt | 31 +++- examples/automated/CMakeLists.txt | 12 ++ examples/automated/benchmark_bitwise_bmi2.cpp | 153 +++++++++++++++++ quest/src/core/bitwise.hpp | 54 +++++- quest/src/cpu/cpu_subroutines.cpp | 74 +++++--- tests/unit/CMakeLists.txt | 19 ++- tests/unit/bitwise.cpp | 161 ++++++++++++++++++ 7 files changed, 477 insertions(+), 27 deletions(-) create mode 100644 examples/automated/benchmark_bitwise_bmi2.cpp create mode 100644 tests/unit/bitwise.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b5a438713..c01b2bfcd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,6 +145,13 @@ option( ) message(STATUS "NUMA awareness is turned ${QUEST_ENABLE_NUMA}. Set QUEST_ENABLE_NUMA to modify.") +option( + QUEST_ENABLE_BMI2 + "Whether QuEST will accelerate CPU bit gather/scatter with x86 BMI2 (PEXT/PDEP) intrinsics (issue #717). Turned OFF by default; when ON, the resulting binary requires a BMI2-capable CPU at runtime." + OFF +) +message(STATUS "BMI2 bitwise acceleration is turned ${QUEST_ENABLE_BMI2}. Set QUEST_ENABLE_BMI2 to modify.") + # Distribution option( @@ -402,13 +409,35 @@ else() set(WARNING_FLAG -Wall) endif() -target_compile_options(QuEST +target_compile_options(QuEST PRIVATE $<$:${WARNING_FLAG}> $<$:${WARNING_FLAG}> ) +# ================================================== +# CPU bit-manipulation acceleration (BMI2, issue #717) +# ================================================== +# The PEXT/PDEP fast paths in quest/src/core/bitwise.hpp are guarded by `#if defined(__BMI2__)`, +# which the compiler only defines when BMI2 codegen is enabled. We add -mbmi2 ONLY when the user opts +# in via QUEST_ENABLE_BMI2 (OFF by default), so a default build stays portable and runs on any x86 CPU +# (it compiles the byte-identical scalar fallback). Without the opt-in, -mbmi2 is never added, so the +# library is free of BMI2 instructions and cannot SIGILL on a pre-BMI2 CPU. 
The generator expression +# scopes the flag to C++ host translation units, so CUDA/HIP device compilation is unaffected (and the +# intrinsics are additionally #ifdef-guarded against __CUDA_ARCH__/__HIP_DEVICE_COMPILE__). A user who +# instead supplies their own -march=native still gets the fast path on their own CPU. +if (QUEST_ENABLE_BMI2) + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-mbmi2" QUEST_COMPILER_SUPPORTS_MBMI2) + if (QUEST_COMPILER_SUPPORTS_MBMI2) + target_compile_options(QuEST PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mbmi2>) + else() + message(WARNING "QUEST_ENABLE_BMI2=ON but the compiler does not accept -mbmi2; building the scalar fallback.") + endif() +endif() + + # ============================ # Link optional dependencies diff --git a/examples/automated/CMakeLists.txt b/examples/automated/CMakeLists.txt index 5880c2ac0..e69169ed3 100644 --- a/examples/automated/CMakeLists.txt +++ b/examples/automated/CMakeLists.txt @@ -1,3 +1,15 @@ # @author Tyson Jones add_all_local_examples() + +# The issue-#717 bitwise micro-benchmark builds with -mbmi2 (so its PEXT/PDEP path is enabled) only +# when the user opts in via QUEST_ENABLE_BMI2 — same switch the library uses. Without the opt-in it +# compiles the scalar fallback and prints "BMI2 fast path: INACTIVE" (never SIGILLs). add_example() +# names the target <name>_<ext>; the flag is scoped to this one target. 
+if (QUEST_ENABLE_BMI2 AND TARGET benchmark_bitwise_bmi2_cpp) + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-mbmi2" QUEST_EXAMPLE_SUPPORTS_MBMI2) + if (QUEST_EXAMPLE_SUPPORTS_MBMI2) + target_compile_options(benchmark_bitwise_bmi2_cpp PRIVATE -mbmi2) + endif() +endif() diff --git a/examples/automated/benchmark_bitwise_bmi2.cpp b/examples/automated/benchmark_bitwise_bmi2.cpp new file mode 100644 index 000000000..9f0cbc826 --- /dev/null +++ b/examples/automated/benchmark_bitwise_bmi2.cpp @@ -0,0 +1,153 @@ +/** @file + * A quick, self-contained micro-benchmark of the BMI2 PEXT/PDEP fast paths added for issue #717, + * comparing them against the original scalar bit gather/scatter loops. It prints per-call timings + * so QuEST's CI can compare the speedup across its tested platforms and compilers. + * + * The two scalar routines below mirror getValueOfBits() and insertBitsWithMaskedValues() from + * quest/src/core/bitwise.hpp; the BMI2 routines are the single-instruction _pext_u64 / _pdep_u64 + * paths. This file deliberately depends on nothing but the C++ standard library (and + * when targeting x86 BMI2), so it compiles and runs on every platform — emitting the scalar + * timings alone where BMI2 is unavailable, never raising SIGILL. + * + * Build note: this target is compiled with -mbmi2 (see examples/automated/CMakeLists.txt) so the + * intrinsic path is enabled; the QuEST library itself enables -mbmi2 the same way in the top-level + * CMakeLists.txt. Whether the fast path was compiled in is printed at runtime. 
+ * + * @author (issue #717 contribution) + */ + +#include +#include +#include + +#if defined(__BMI2__) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)) + #include + #define BENCH_USE_BMI2 +#endif + +using std::uint64_t; + +// --- scalar references (mirroring quest/src/core/bitwise.hpp) ------------------------------------- + +// getValueOfBits: gather the bits at the given (strictly increasing) positions into the low bits. +static inline uint64_t scalarGather(uint64_t number, const int* inds, int n) { + uint64_t value = 0; + for (int i=0; i> inds[i]) & 1ULL) << i; + return value; +} + +// insertBitsWithMaskedValues: spread number's low bits into the positions NOT named by inds (i.e. +// insert a 0 at each increasing index), then OR in the precomputed value mask. +static inline uint64_t scalarScatter(uint64_t number, const int* inds, int n, uint64_t valueMask) { + uint64_t r = number; + for (int i=0; i +static double timeMin(uint64_t iters, int reps, F&& fn) { + double best = 1e300; + for (int r=0; r(t1 - t0).count(); + if (s < best) best = s; + } + return best; +} + +int main() { + + printf("QuEST issue #717 - BMI2 PEXT/PDEP bitwise micro-benchmark\n"); +#ifdef BENCH_USE_BMI2 + printf("BMI2 fast path: ACTIVE (compiled with -mbmi2)\n\n"); +#else + printf("BMI2 fast path: INACTIVE (x86 BMI2 not targeted; scalar timings only)\n\n"); +#endif + + const uint64_t iters = 8000000; // keeps total runtime well under a second + const int reps = 3; + const int counts[] = {3, 6}; // representative qubit-arity per gate + + printf("%-8s %-4s %14s %14s %10s\n", "op", "k", "scalar ns/call", "bmi2 ns/call", "speedup"); + + for (int ci=0; ci<2; ci++) { + int k = counts[ci]; + + // a fixed, strictly-increasing index set and a value mask consistent with it + int inds[8]; + for (int i=0; i #endif +// Optional BMI2 PEXT/PDEP fast paths for the bit gather/scatter helpers below (issue #717). 
+// Active only when BMI2 is actually targeted (__BMI2__), i.e. when the build opts in with +// -DQUEST_ENABLE_BMI2=ON or the user supplies their own -march=native; a default build defines no +// such flag and compiles the byte-identical scalar fallback, so it stays portable. Restricted to x86 +// host compilation (never CUDA/HIP device code, where INLINE becomes __device__). Define +// QUEST_BITWISE_FORCE_SCALAR to force the scalar path even on a BMI2-capable host. +#if defined(__BMI2__) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)) \ + && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(QUEST_BITWISE_FORCE_SCALAR) + #include <immintrin.h> + #define QUEST_BITWISE_USE_BMI2 +#endif + #include "quest/include/types.h" #include "quest/src/core/inliner.hpp" - - /* * PERFORMANCE-CRITICAL FUNCTIONS * @@ -212,6 +222,46 @@ INLINE qindex insertBitsWithMaskedValues(qindex number, const int* bitInds, int } +/* + * Mask-accepting variants of the bit gather/scatter helpers (issue #717). + * + * The caller computes the loop-invariant POSITION mask (a bit set at every index in bitInds) + * once, before the exponentially-large statevector loop, e.g. + * qindex posMask = getBitMask(sortedInds.data(), numInds); + * so each per-amplitude call collapses to a single PDEP/PEXT instruction instead of an + * O(numBits) loop. bitInds/numBits are retained so that, when BMI2 is unavailable, the fallback + * reuses the original unrolled scalar routines and stays byte-identical. 
+ */ + +INLINE qindex insertBitsWithMaskedValuesAndPosMask(qindex number, qindex valueMask, [[maybe_unused]] qindex posMask, [[maybe_unused]] const int* bitInds, [[maybe_unused]] int numBits) { +#ifdef QUEST_BITWISE_USE_BMI2 + return valueMask | (qindex) _pdep_u64((unsigned long long) number, ~ (unsigned long long) posMask); +#else + return valueMask | insertBits(number, bitInds, numBits, 0); +#endif +} + +INLINE qindex getValueOfBitsFromSortedPosMask(qindex number, [[maybe_unused]] qindex posMask, [[maybe_unused]] const int* bitInds, [[maybe_unused]] int numBits) { + // PEXT emits the gathered bits in ascending position order, so this matches getValueOfBits + // only when bitInds are strictly increasing. The caller checks that once per gate (see + // isStrictlyIncreasing) and supplies posMask = getBitMask(bitInds, numBits). +#ifdef QUEST_BITWISE_USE_BMI2 + return (qindex) _pext_u64((unsigned long long) number, (unsigned long long) posMask); +#else + return getValueOfBits(number, bitInds, numBits); +#endif +} + +// Checked once per gate (loop-invariant), never per amplitude: getValueOfBits is order-sensitive, +// so the PEXT path above is valid only when bitInds are strictly increasing. 
+INLINE bool isStrictlyIncreasing(const int* bitInds, int numBits) { + for (int i=1; i= bitInds[i]) + return false; + return true; +} + + INLINE int getTwoBits(qindex number, int highInd, int lowInd) { int b1 = getBit(number, lowInd); diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp index 59df946e9..ec8621021 100644 --- a/quest/src/cpu/cpu_subroutines.cpp +++ b/quest/src/cpu/cpu_subroutines.cpp @@ -236,11 +236,12 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const // use template param to compile-time unroll loop in insertBits() SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size()); + qindex qubitsPosMask = getBitMask(sortedQubitInds.data(), numBits); // loop-invariant: hoisted out of the per-amplitude loop #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n cache(numTargAmps); + qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits); // loop-invariant: hoisted out of the per-amplitude loop #pragma omp for for (qindex n=0; n= cpu_getAvailableNumThreads()) { // parallel + qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits); // loop-invariant: hoisted out of the per-amplitude loop #pragma omp parallel for if(qureg.isMultithreaded) for (qindex n=0; n + +#include +#include +#include + +namespace { + + // k distinct indices in [0, maxBit), returned strictly increasing + std::vector randomIncreasingInds(std::mt19937_64& rng, int k, int maxBit) { + std::vector pool(maxBit); + for (int i=0; i inds(pool.begin(), pool.begin() + k); + std::sort(inds.begin(), inds.end()); + return inds; + } +} + +TEST_CASE( "issue #717 helpers compiled path", "[bitwise]" ) { + + // surfaced in the CI log so it is clear which path these tests exercised +#ifdef QUEST_BITWISE_USE_BMI2 + WARN( "bitwise helpers compiled with BMI2 PEXT/PDEP enabled" ); +#else + WARN( "bitwise helpers compiled with the scalar fallback (BMI2 not targeted)" ); +#endif + SUCCEED(); +} + 
+TEST_CASE( "getValueOfBitsFromSortedPosMask matches getValueOfBits", "[bitwise]" ) { + + std::mt19937_64 rng(0x717ULL); + + for (int k=0; k<=12; k++) { + for (int trial=0; trial<200; trial++) { + + std::vector inds = (k==0) + ? std::vector{} + : randomIncreasingInds(rng, k, 50); + + qindex posMask = getBitMask(inds.data(), k); + + for (int s=0; s<8; s++) { + qindex number = (qindex) (rng() & ((1ULL<<50) - 1)); // bits live in [0,50) + REQUIRE( + getValueOfBitsFromSortedPosMask(number, posMask, inds.data(), k) == + getValueOfBits(number, inds.data(), k) ); + } + } + } +} + +TEST_CASE( "insertBitsWithMaskedValuesAndPosMask matches insertBitsWithMaskedValues", "[bitwise]" ) { + + std::mt19937_64 rng(0x718ULL); + + for (int k=0; k<=12; k++) { + for (int trial=0; trial<200; trial++) { + + std::vector inds = (k==0) + ? std::vector{} + : randomIncreasingInds(rng, k, 50); + + qindex posMask = getBitMask(inds.data(), k); + + // per the original contract, the value mask is zero except at the inserted positions + qindex valueMask = ((qindex) rng()) & posMask; + + for (int s=0; s<8; s++) { + qindex number = (qindex) (rng() & ((1ULL<<40) - 1)); // avoid shifting bits past bit 63 + REQUIRE( + insertBitsWithMaskedValuesAndPosMask(number, valueMask, posMask, inds.data(), k) == + insertBitsWithMaskedValues(number, inds.data(), k, valueMask) ); + } + } + } +} + +TEST_CASE( "helpers match at boundary bit positions", "[bitwise]" ) { + + // Deterministic coverage of the awkward positions the randomised tests above never reach: + // the 32-bit word boundary (31/32) and the high bits 61/62/63 — bit 63 being the sign bit of the + // signed qindex, where the scalar (arithmetic-shift) and BMI2 (unsigned PEXT/PDEP) paths are most + // likely to disagree if anything is wrong. 
+ const std::vector> indexSets = { + {31}, {32}, {63}, {31, 32}, {62, 63}, {0, 63}, + {0, 31, 32, 63}, {30, 31, 32, 33}, {59, 60, 61, 62, 63}, + }; + const std::vector numbers = { + 0ULL, + ~0ULL, // all bits set + 1ULL << 63, // only the sign bit + (1ULL << 63) | 1ULL, // sign bit + bit 0 + 0x00000000FFFFFFFFULL, // low 32 + 0xFFFFFFFF00000000ULL, // high 32 + (1ULL << 31) | (1ULL << 32), // straddle the word boundary + 0xAAAAAAAAAAAAAAAAULL, // alternating + 0x5555555555555555ULL, + }; + + for (const auto& inds : indexSets) { + int k = (int) inds.size(); + qindex posMask = getBitMask(inds.data(), k); + + for (unsigned long long raw : numbers) { + + // gather: any 64-bit input is valid (reads bits, incl. bit 63 of a negative qindex) + qindex g = (qindex) raw; + REQUIRE( + getValueOfBitsFromSortedPosMask(g, posMask, inds.data(), k) == + getValueOfBits(g, inds.data(), k) ); + + // insert: keep the input within its low (64-k) significant bits so the scalar reference + // is well-defined (no shift past bit 63); still lets a high input bit land on position 63. + unsigned long long fitMask = (k == 0) ? 
~0ULL : ((1ULL << (64 - k)) - 1); + qindex n = (qindex) (raw & fitMask); + for (qindex valueMask : { (qindex) 0, (qindex) (g & posMask) }) { + REQUIRE( + insertBitsWithMaskedValuesAndPosMask(n, valueMask, posMask, inds.data(), k) == + insertBitsWithMaskedValues(n, inds.data(), k, valueMask) ); + } + } + } +} + +TEST_CASE( "isStrictlyIncreasing detects order", "[bitwise]" ) { + + int sorted[] = {0, 2, 5, 9}; + int equalAdj[] = {0, 2, 2, 9}; + int decreasing[] = {9, 5, 2, 0}; + + REQUIRE( isStrictlyIncreasing(sorted, 4) ); + REQUIRE_FALSE( isStrictlyIncreasing(equalAdj, 4) ); + REQUIRE_FALSE( isStrictlyIncreasing(decreasing, 4) ); + + // trivially ordered for 0 or 1 elements + REQUIRE( isStrictlyIncreasing(sorted, 1) ); + REQUIRE( isStrictlyIncreasing(sorted, 0) ); +} From ce7c3ae0b839de7bf131baaebe117d570aee5389 Mon Sep 17 00:00:00 2001 From: Tyson Jones Date: Mon, 15 Jun 2026 15:20:57 -0400 Subject: [PATCH 2/2] Tailor CI --- .github/workflows/compile.yml | 62 +++++++++++++++++++-------------- .github/workflows/test_free.yml | 12 ++++--- 2 files changed, 43 insertions(+), 31 deletions(-) diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml index c86de84f1..c66be8dbe 100644 --- a/.github/workflows/compile.yml +++ b/.github/workflows/compile.yml @@ -23,6 +23,10 @@ name: compile +### DEBUG +### disabled all but single-CPU + + on: push: branches: @@ -60,14 +64,14 @@ jobs: # compile QuEST with all combinations of below flags matrix: - os: [windows-latest, ubuntu-latest, macos-latest] - precision: [1, 2, 4] - omp: [ON, OFF] - mpi: [ON, OFF] - cuda: [ON, OFF] - hip: [ON, OFF] - cuquantum: [ON, OFF] - mpilib: ['', 'mpich', 'ompi', 'impi', 'msmpi'] + os: [windows-latest, ubuntu-latest, macos-latest, macos-15-intel, macos-26-intel] + precision: [2] #[1, 2, 4] + omp: [OFF] #[ON, OFF] + mpi: [OFF] #[ON, OFF] + cuda: [OFF] #[ON, OFF] + hip: [OFF] #[ON, OFF] + cuquantum: [OFF] #[ON, OFF] + mpilib: [''] #['', 'mpich', 'ompi', 'impi', 'msmpi'] # disable 
deprecated API on MSVC, and assign unique compilers, # so that we can concisely consult e.g. matrix.compiler=='cl' @@ -240,7 +244,7 @@ jobs: run: > cmake -B ${{ env.build_dir }} -DQUEST_BUILD_EXAMPLES=ON - -DQUEST_BUILD_TESTS=ON + -DQUEST_BUILD_TESTS=OFF -DQUEST_FLOAT_PRECISION=${{ matrix.precision }} -DQUEST_ENABLE_DEPRECATED_API=${{ matrix.deprecated }} -DQUEST_DISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }} @@ -260,24 +264,24 @@ jobs: # run all compiled isolated examples to test for link-time errors, # continuing if any fail (since some deliberately fail) - - name: Run isolated examples (Windows) - if: ${{ matrix.os == 'windows-latest' }} - working-directory: ${{ env.isolated_dir }}/Release/ - shell: pwsh - run: | - Get-ChildItem -Filter '*.exe' -File | - ForEach-Object { - Write-Host "`r`n[[[ $($_.Name) ]]]`r`n" - & $_.FullName - } - - name: Run isolated examples (Unix) - if: ${{ matrix.os != 'windows-latest' }} - working-directory: ${{ env.isolated_dir }} - run: | - for fn in *_c *_cpp; do - printf "\n[[[ $fn ]]]\n" - ./$fn || true - done + # - name: Run isolated examples (Windows) + # if: ${{ matrix.os == 'windows-latest' }} + # working-directory: ${{ env.isolated_dir }}/Release/ + # shell: pwsh + # run: | + # Get-ChildItem -Filter '*.exe' -File | + # ForEach-Object { + # Write-Host "`r`n[[[ $($_.Name) ]]]`r`n" + # & $_.FullName + # } + # - name: Run isolated examples (Unix) + # if: ${{ matrix.os != 'windows-latest' }} + # working-directory: ${{ env.isolated_dir }} + # run: | + # for fn in *_c *_cpp; do + # printf "\n[[[ $fn ]]]\n" + # ./$fn || true + # done # run all compiled 'automated' examples - name: Run automated examples (Windows) @@ -289,6 +293,10 @@ jobs: ForEach-Object { Write-Host "`r`n[[[ $($_.Name) ]]]`r`n" & $_.FullName + if ($LASTEXITCODE -ne 0) { + Write-Warning "$($_.Name) exited with code $LASTEXITCODE" + $global:LASTEXITCODE = 0 + } } - name: Run automated examples (Unix) if: ${{ matrix.os != 'windows-latest' }} diff --git 
a/.github/workflows/test_free.yml b/.github/workflows/test_free.yml index 2d332e842..f6c20e1dd 100644 --- a/.github/workflows/test_free.yml +++ b/.github/workflows/test_free.yml @@ -10,6 +10,10 @@ name: test (free, serial) +### DEBUG +### disabled all but single-CPU + + on: push: branches: @@ -27,7 +31,7 @@ jobs: # excluding the v4 integration tests, for free serial-unit-test: name: > - ${{ matrix.os == 'ubuntu-latest' && 'Linux' || matrix.os == 'macos-latest' && 'MacOS' || 'Windows' }} + ${{ matrix.os == 'ubuntu-latest' && 'Linux' || startsWith(matrix.os, 'macos') && 'MacOS' || 'Windows' }} [${{ matrix.precision }}] serial unit v${{ matrix.version }} @@ -40,9 +44,9 @@ jobs: # we will compile QuEST with all precisions but no parallelisation matrix: - os: [ubuntu-latest, macos-latest, windows-latest] - version: [3, 4] - precision: [1, 2, 4] + os: [ubuntu-latest, macos-latest, windows-latest, macos-15-intel, macos-26-intel] + version: [4] # [3, 4] + precision: [2] # [1, 2, 4] # MSVC cannot compile deprecated v3 tests exclude: