From 35c7d3af6fd31fb9cb5a9f824df24015ab4a2efd Mon Sep 17 00:00:00 2001
From: nez0b <pwang529@gmail.com>
Date: Mon, 15 Jun 2026 20:25:26 +0800
Subject: [PATCH 1/2] Optimise getValueOfBits and insertBits with BMI2
 PEXT/PDEP (#717)

Replace the per-amplitude looped bit gather/scatter in the CPU statevector/
density-matrix kernels with x86 BMI2 PEXT/PDEP, hoisting the loop-invariant
masks out of the 2^N loops so each per-amplitude call becomes one instruction:

- insertBitsWithMaskedValues sites: compute the position mask once per gate and
  use _pdep_u64 (order-invariant scatter; unconditionally correct).
- getValueOfBits sites: _pext_u64 when the qubits are strictly increasing,
  falling back to the original scalar loop otherwise (order is preserved).

Portability: BMI2 is opt-in via a new QUEST_ENABLE_BMI2 CMake option, OFF by
default, so a default build stays portable scalar (no BMI2 in the binary, no
SIGILL on pre-BMI2 CPUs). Enabling it wires -mbmi2 through the library; the
intrinsics are additionally guarded to x86 host TUs (never CUDA/HIP device
code). The scalar fallback is byte-identical, and QUEST_BITWISE_FORCE_SCALAR
forces it on a BMI2-capable host.

Tests/benchmark: tests/unit/bitwise.cpp asserts the new helpers are
bit-identical to the originals over exhaustive-small, randomised, and boundary
inputs (bits 31/32/61/62/63 incl. the int64 sign bit); examples/automated adds
a cross-platform benchmark that prints timings and which path was compiled in.

Bit-identical to the scalar path (verified by unit tests, the QuEST suite for
the touched kernels, and amplitude hashes of QFT/random/Grover/VQE circuits).

Closes #717
---
 CMakeLists.txt                                |  31 +++-
 examples/automated/CMakeLists.txt             |  12 ++
 examples/automated/benchmark_bitwise_bmi2.cpp | 153 +++++++++++++++++
 quest/src/core/bitwise.hpp                    |  54 +++++-
 quest/src/cpu/cpu_subroutines.cpp             |  74 +++++---
 tests/unit/CMakeLists.txt                     |  19 ++-
 tests/unit/bitwise.cpp                        | 161 ++++++++++++++++++
 7 files changed, 477 insertions(+), 27 deletions(-)
 create mode 100644 examples/automated/benchmark_bitwise_bmi2.cpp
 create mode 100644 tests/unit/bitwise.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b5a438713..c01b2bfcd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -145,6 +145,13 @@ option(
 )
 message(STATUS "NUMA awareness is turned ${QUEST_ENABLE_NUMA}. Set QUEST_ENABLE_NUMA to modify.")
 
+option(
+  QUEST_ENABLE_BMI2
+  "Whether QuEST will accelerate CPU bit gather/scatter with x86 BMI2 (PEXT/PDEP) intrinsics (issue #717). Turned OFF by default; when ON, the resulting binary requires a BMI2-capable CPU at runtime."
+  OFF
+)
+message(STATUS "BMI2 bitwise acceleration is turned ${QUEST_ENABLE_BMI2}. Set QUEST_ENABLE_BMI2 to modify.")
+
 
 # Distribution
 option(
@@ -402,13 +409,35 @@ else()
   set(WARNING_FLAG -Wall)
 endif()
 
-target_compile_options(QuEST 
+target_compile_options(QuEST
   PRIVATE
   $<$<COMPILE_LANGUAGE:CXX>:${WARNING_FLAG}>
   $<$<COMPILE_LANGUAGE:C>:${WARNING_FLAG}>
 )
 
 
+# ==================================================
+# CPU bit-manipulation acceleration (BMI2, issue #717)
+# ==================================================
+# The PEXT/PDEP fast paths in quest/src/core/bitwise.hpp are guarded by `#if defined(__BMI2__)`,
+# which the compiler only defines when BMI2 codegen is enabled. We add -mbmi2 ONLY when the user opts
+# in via QUEST_ENABLE_BMI2 (OFF by default), so a default build stays portable and runs on any x86 CPU
+# (it compiles the byte-identical scalar fallback). Without the opt-in, -mbmi2 is never added, so the
+# library is free of BMI2 instructions and cannot SIGILL on a pre-BMI2 CPU. The generator expression
+# scopes the flag to C++ host translation units, so CUDA/HIP device compilation is unaffected (and the
+# intrinsics are additionally #ifdef-guarded against __CUDA_ARCH__/__HIP_DEVICE_COMPILE__). A user who
+# instead supplies their own -march=native still gets the fast path on their own CPU.
+if (QUEST_ENABLE_BMI2)
+  include(CheckCXXCompilerFlag)
+  check_cxx_compiler_flag("-mbmi2" QUEST_COMPILER_SUPPORTS_MBMI2)  # probe result is stored in the CMake cache
+  if (NOT QUEST_COMPILER_SUPPORTS_MBMI2)
+    message(WARNING "QUEST_ENABLE_BMI2=ON but the compiler does not accept -mbmi2; building the scalar fallback.")
+  else()
+    target_compile_options(QuEST PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-mbmi2>")
+  endif()
+endif()
+
+
 
 # ============================
 # Link optional dependencies
diff --git a/examples/automated/CMakeLists.txt b/examples/automated/CMakeLists.txt
index 5880c2ac0..e69169ed3 100644
--- a/examples/automated/CMakeLists.txt
+++ b/examples/automated/CMakeLists.txt
@@ -1,3 +1,15 @@
 # @author Tyson Jones
 
 add_all_local_examples()
+
+# The issue-#717 bitwise micro-benchmark builds with -mbmi2 (so its PEXT/PDEP path is enabled) only
+# when the user opts in via QUEST_ENABLE_BMI2 — same switch the library uses. Without the opt-in it
+# compiles the scalar fallback and prints "BMI2 fast path: INACTIVE" (never SIGILLs). add_example()
+# names the target <filename>_<ext>; the flag is scoped to this one target.
+if (QUEST_ENABLE_BMI2 AND TARGET benchmark_bitwise_bmi2_cpp)
+  include(CheckCXXCompilerFlag)
+  check_cxx_compiler_flag("-mbmi2" QUEST_EXAMPLE_SUPPORTS_MBMI2)  # skip the flag if the compiler rejects it
+  if (QUEST_EXAMPLE_SUPPORTS_MBMI2)
+    # scope the flag to C++ sources only, matching how the QuEST library target receives -mbmi2
+    target_compile_options(benchmark_bitwise_bmi2_cpp PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-mbmi2>")
+  endif()
+endif()
diff --git a/examples/automated/benchmark_bitwise_bmi2.cpp b/examples/automated/benchmark_bitwise_bmi2.cpp
new file mode 100644
index 000000000..9f0cbc826
--- /dev/null
+++ b/examples/automated/benchmark_bitwise_bmi2.cpp
@@ -0,0 +1,153 @@
+/** @file
+ * A quick, self-contained micro-benchmark of the BMI2 PEXT/PDEP fast paths added for issue #717,
+ * comparing them against the original scalar bit gather/scatter loops. It prints per-call timings
+ * so QuEST's CI can compare the speedup across its tested platforms and compilers.
+ *
+ * The two scalar routines below mirror getValueOfBits() and insertBitsWithMaskedValues() from
+ * quest/src/core/bitwise.hpp; the BMI2 routines are the single-instruction _pext_u64 / _pdep_u64
+ * paths. This file deliberately depends on nothing but the C++ standard library (and <immintrin.h>
+ * when targeting x86 BMI2), so it compiles and runs on every platform — emitting the scalar
+ * timings alone where BMI2 is unavailable, never raising SIGILL.
+ *
+ * Build note: this target is compiled with -mbmi2 only when QUEST_ENABLE_BMI2=ON (see
+ * examples/automated/CMakeLists.txt); the QuEST library enables -mbmi2 via the same option in the
+ * top-level CMakeLists.txt. Whether the fast path was compiled in is printed at runtime.
+ *
+ * @author (issue #717 contribution)
+ */
+
+#include <cstdint>
+#include <cstdio>
+#include <chrono>
+
+#if defined(__BMI2__) && (defined(__x86_64__) || defined(_M_X64))  // _pext_u64/_pdep_u64 are declared for x86-64 only, so 32-bit x86 takes the scalar path
+  #include <immintrin.h>
+  #define BENCH_USE_BMI2
+#endif
+
+using std::uint64_t;
+
+// --- scalar references (mirroring quest/src/core/bitwise.hpp) -------------------------------------
+
+// Scalar reference for getValueOfBits: bit inds[i] of number becomes bit i of the result (main() passes strictly increasing inds).
+static inline uint64_t scalarGather(uint64_t number, const int* inds, int n) {
+    uint64_t value = 0;
+    for (int i=0; i<n; i++)
+        value |= ((number >> inds[i]) & 1ULL) << i;   // pick bit inds[i], place it at output bit i
+    return value;
+}
+
+// Scalar reference for insertBitsWithMaskedValues: for each index in inds (taken in array order;
+// main() supplies them ascending) open a 0 bit at that position, then OR in the precomputed value mask.
+static inline uint64_t scalarScatter(uint64_t number, const int* inds, int n, uint64_t valueMask) {
+    uint64_t r = number;
+    for (int i=0; i<n; i++) {
+        uint64_t lo = r & ((1ULL << inds[i]) - 1);    // bits strictly below inds[i] stay where they are
+        uint64_t hi = r & ~((1ULL << inds[i]) - 1);   // bits at or above inds[i] ...
+        r = (hi << 1) | lo;                           // ... shift up by one, leaving bit inds[i] clear
+    }
+    return valueMask | r;
+}
+
+static inline uint64_t makePosMask(const int* inds, int n) {   // returns a mask with bit inds[i] set for every i in [0, n)
+    uint64_t m = 0;
+    for (int i=0; i<n; i++)
+        m |= 1ULL << inds[i];
+    return m;
+}
+
+// --- timing harness ------------------------------------------------------------------------------
+
+static double nsPerCall(uint64_t iters, double seconds) {
+    return 1e9 * seconds / static_cast<double>(iters);  // total seconds for iters calls -> nanoseconds per call
+}
+
+template <typename F>
+static double timeMin(uint64_t iters, int reps, F&& fn) {
+    using SteadyClock = std::chrono::steady_clock;
+    double fastest = 1e300;  // returned unchanged when reps <= 0; otherwise the shortest of reps timed runs, in seconds
+    for (int remaining = reps; remaining > 0; --remaining) {
+        const SteadyClock::time_point begin = SteadyClock::now();
+        fn(iters);
+        const double elapsed = std::chrono::duration<double>(SteadyClock::now() - begin).count();
+        if (elapsed < fastest) fastest = elapsed;
+    }
+    return fastest;
+}
+
+int main() {
+
+    printf("QuEST issue #717 - BMI2 PEXT/PDEP bitwise micro-benchmark\n");
+#ifdef BENCH_USE_BMI2
+    printf("BMI2 fast path: ACTIVE (compiled with -mbmi2)\n\n");
+#else
+    printf("BMI2 fast path: INACTIVE (x86 BMI2 not targeted; scalar timings only)\n\n");
+#endif
+
+    const uint64_t iters = 8000000;   // keeps total runtime well under a second
+    const int reps = 3;
+    const int counts[] = {3, 6};      // representative qubit-arity per gate
+
+    printf("%-8s %-4s %14s %14s %10s\n", "op", "k", "scalar ns/call", "bmi2 ns/call", "speedup");
+
+    for (int ci=0; ci<2; ci++) {
+        int k = counts[ci];
+
+        // a fixed, strictly-increasing index set and a value mask consistent with it
+        int inds[8];
+        for (int i=0; i<k; i++) inds[i] = 3*i + 1;
+        uint64_t posMask = makePosMask(inds, k);
+        uint64_t valueMask = posMask & 0xA5A5A5A5A5A5A5A5ULL;
+
+        volatile uint64_t sink = 0;
+
+        // ---- gather (getValueOfBits) ----
+        double sg = timeMin(iters, reps, [&](uint64_t N){
+            uint64_t acc = 0;
+            for (uint64_t n=0; n<N; n++) acc ^= scalarGather(n, inds, k);
+            sink ^= acc;
+        });
+#ifdef BENCH_USE_BMI2
+        double bg = timeMin(iters, reps, [&](uint64_t N){
+            uint64_t acc = 0;
+            for (uint64_t n=0; n<N; n++) acc ^= (uint64_t) _pext_u64(n, posMask);
+            sink ^= acc;
+        });
+        printf("%-8s %-4d %14.3f %14.3f %9.2fx\n", "gather", k,
+               nsPerCall(iters, sg), nsPerCall(iters, bg), sg/bg);
+#else
+        printf("%-8s %-4d %14.3f %14s %10s\n", "gather", k, nsPerCall(iters, sg), "-", "-");
+#endif
+
+        // ---- scatter (insertBitsWithMaskedValues) ----
+        double ss = timeMin(iters, reps, [&](uint64_t N){
+            uint64_t acc = 0;
+            for (uint64_t n=0; n<N; n++) acc ^= scalarScatter(n, inds, k, valueMask);
+            sink ^= acc;
+        });
+#ifdef BENCH_USE_BMI2
+        double bs = timeMin(iters, reps, [&](uint64_t N){
+            uint64_t acc = 0;
+            for (uint64_t n=0; n<N; n++) acc ^= (valueMask | (uint64_t) _pdep_u64(n, ~posMask));
+            sink ^= acc;
+        });
+        printf("%-8s %-4d %14.3f %14.3f %9.2fx\n", "scatter", k,
+               nsPerCall(iters, ss), nsPerCall(iters, bs), ss/bs);
+#else
+        printf("%-8s %-4d %14.3f %14s %10s\n", "scatter", k, nsPerCall(iters, ss), "-", "-");
+#endif
+
+#ifdef BENCH_USE_BMI2
+        // sanity: the intrinsic and scalar paths must agree (bit-for-bit) for these sorted indices
+        bool ok = true;
+        for (uint64_t n=0; n<4096 && ok; n++) {
+            if ((uint64_t)_pext_u64(n, posMask) != scalarGather(n, inds, k)) ok = false;
+            if ((valueMask | (uint64_t)_pdep_u64(n, ~posMask)) != scalarScatter(n, inds, k, valueMask)) ok = false;
+        }
+        printf("           (k=%d results verified bit-identical to scalar: %s)\n", k, ok ? "yes" : "NO");
+#endif
+        (void) sink;
+    }
+
+    return 0;
+}
diff --git a/quest/src/core/bitwise.hpp b/quest/src/core/bitwise.hpp
index f5266afa4..5e3298d5a 100644
--- a/quest/src/core/bitwise.hpp
+++ b/quest/src/core/bitwise.hpp
@@ -14,12 +14,22 @@
   #include <intrin.h>
 #endif
 
+// Optional BMI2 PEXT/PDEP fast paths for the bit gather/scatter helpers below (issue #717).
+// Active only when BMI2 is actually targeted (__BMI2__), i.e. when the build opts in with
+// -DQUEST_ENABLE_BMI2=ON or the user supplies their own -march=native; a default build defines no
+// such flag and compiles the byte-identical scalar fallback, so it stays portable. Restricted to
+// x86-64 host compilation: the 64-bit _pext_u64/_pdep_u64 intrinsics are not declared for 32-bit x86, and they must never
+// reach CUDA/HIP device code (where INLINE becomes __device__). Define QUEST_BITWISE_FORCE_SCALAR to force the scalar path.
+#if defined(__BMI2__) && (defined(__x86_64__) || defined(_M_X64)) \
+    && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(QUEST_BITWISE_FORCE_SCALAR)
+  #include <immintrin.h>
+  #define QUEST_BITWISE_USE_BMI2
+#endif
+
 #include "quest/include/types.h"
 
 #include "quest/src/core/inliner.hpp"
 
-
-
 /* 
  * PERFORMANCE-CRITICAL FUNCTIONS
  *
@@ -212,6 +222,46 @@ INLINE qindex insertBitsWithMaskedValues(qindex number, const int* bitInds, int
 }
 
 
+/*
+ * Mask-accepting variants of the bit gather/scatter helpers (issue #717).
+ *
+ * The caller computes the loop-invariant POSITION mask (a bit set at every index in bitInds)
+ * once, before the exponentially-large statevector loop, e.g.
+ *     qindex posMask = getBitMask(sortedInds.data(), numInds);
+ * so each per-amplitude call collapses to a single PDEP/PEXT instruction instead of an
+ * O(numBits) loop. bitInds/numBits are retained so that, when BMI2 is unavailable, the fallback
+ * reuses the original unrolled scalar routines and stays byte-identical.
+ */
+
+INLINE qindex insertBitsWithMaskedValuesAndPosMask(qindex number, qindex valueMask, [[maybe_unused]] qindex posMask, [[maybe_unused]] const int* bitInds, [[maybe_unused]] int numBits) {
+#ifndef QUEST_BITWISE_USE_BMI2
+    return insertBits(number, bitInds, numBits, 0) | valueMask;  // scalar: open a 0 bit at each index, then overlay valueMask
+#else
+    return static_cast<qindex>(_pdep_u64(static_cast<unsigned long long>(number), ~static_cast<unsigned long long>(posMask))) | valueMask;  // PDEP deposits number's low bits into the positions outside posMask
+#endif
+}
+
+INLINE qindex getValueOfBitsFromSortedPosMask(qindex number, [[maybe_unused]] qindex posMask, [[maybe_unused]] const int* bitInds, [[maybe_unused]] int numBits) {
+    // Precondition: bitInds is strictly increasing and posMask = getBitMask(bitInds, numBits).
+    // PEXT packs the selected bits in ascending position order, which coincides with the index
+    // order used by getValueOfBits only under that precondition; callers test it once per gate.
+#ifndef QUEST_BITWISE_USE_BMI2
+    return getValueOfBits(number, bitInds, numBits);
+#else
+    return static_cast<qindex>(_pext_u64(static_cast<unsigned long long>(number), static_cast<unsigned long long>(posMask)));
+#endif
+}
+
+// Returns true when every entry of bitInds exceeds its predecessor (trivially true for numBits < 2).
+// Meant to be evaluated once per gate, outside the amplitude loop, to decide whether PEXT may replace getValueOfBits.
+INLINE bool isStrictlyIncreasing(const int* bitInds, int numBits) {
+    bool ascending = true;
+    for (int next=1; ascending && next<numBits; next++)
+        ascending = (bitInds[next-1] < bitInds[next]);
+    return ascending;
+}
+
+
 INLINE int getTwoBits(qindex number, int highInd, int lowInd) {
 
     int b1 = getBit(number, lowInd);
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 59df946e9..ec8621021 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -236,11 +236,12 @@ qindex cpu_statevec_packAmpsIntoBuffer(Qureg qureg, ConstList64 qubitInds, Const
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubitInds.size());
 
+    qindex qubitsPosMask = getBitMask(sortedQubitInds.data(), numBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = nth local index where qubits are in specified states
-        qindex i = insertBitsWithMaskedValues(n, sortedQubitInds.data(), numBits, qubitStateMask);
+        qindex i = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubitInds.data(), numBits);
 
         // pack the potentially-strided amplitudes into a contiguous sub-buffer
         buffer[offset + n] = amps[i];
@@ -308,11 +309,12 @@ void cpu_statevec_anyCtrlSwap_subA(Qureg qureg, ConstList64 ctrls, ConstList64 c
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
     int numQubitBits = numCtrlBits + 2;
 
+    qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i01 = nth local index where ctrls are active, targ2=0 and targ1=1
-        qindex i01 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask);
+        qindex i01 = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numQubitBits);
         qindex i10 = flipTwoBits(i01, targ2, targ1);
 
         std::swap(amps[i01], amps[i10]);
@@ -341,11 +343,12 @@ void cpu_statevec_anyCtrlSwap_subB(Qureg qureg, ConstList64 ctrls, ConstList64 c
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
 
+    qindex ctrlsPosMask = getBitMask(sortedCtrls.data(), numCtrlBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = nth local index where ctrls are in specified states
-        qindex i = insertBitsWithMaskedValues(n, sortedCtrls.data(), numCtrlBits, ctrlStateMask);
+        qindex i = insertBitsWithMaskedValuesAndPosMask(n, ctrlStateMask, ctrlsPosMask, sortedCtrls.data(), numCtrlBits);
 
         // j = index of nth received amplitude from pair rank in buffer
         qindex j = n + offset;
@@ -378,11 +381,12 @@ void cpu_statevec_anyCtrlSwap_subC(Qureg qureg, ConstList64 ctrls, ConstList64 c
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
     int numQubitBits = numCtrlBits + 1;
 
+    qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = nth local index where ctrls and targ are in specified states
-        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask);
+        qindex i = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numQubitBits);
     
         // j = index of nth received amplitude from pair rank in buffer
         qindex j = n + offset;
@@ -423,11 +427,13 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, ConstList64 ctrls, C
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
     int numQubitBits = numCtrlBits + 1;
 
+    qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits);  // loop-invariant: hoisted out of the per-amplitude loop
+
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i0 = nth local index where ctrl bits are in specified states and targ is 0
-        qindex i0 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask);
+        qindex i0 = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numQubitBits);
         qindex i1 = flipBit(i0, targ);
 
         // note the two amplitudes are likely strided and not adjacent (separated by 2^t)
@@ -463,11 +469,12 @@ void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, ConstList64 ctrls, C
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
 
+    qindex ctrlsPosMask = getBitMask(sortedCtrls.data(), numCtrlBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = nth local index where ctrl bits are in specified states
-        qindex i = insertBitsWithMaskedValues(n, sortedCtrls.data(), numCtrlBits, ctrlStateMask);
+        qindex i = insertBitsWithMaskedValuesAndPosMask(n, ctrlStateMask, ctrlsPosMask, sortedCtrls.data(), numCtrlBits);
 
         // j = index of nth received amplitude from pair rank in buffer
         qindex j = n + offset;
@@ -506,11 +513,12 @@ void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
     int numQubitBits = numCtrlBits + 2;
 
+    qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i0 = nth local index where ctrl bits are in specified states and both targs are 0
-        qindex i00 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask);
+        qindex i00 = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numQubitBits);
         qindex i01 = flipBit(i00, targ1);
         qindex i10 = flipBit(i00, targ2);
         qindex i11 = flipBit(i01, targ2);
@@ -588,11 +596,12 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, ConstList64 ctrls, Co
         // create a private cache for every thread (might be compile-time sized, and in heap or stack)
         vector<cpu_qcomp> cache(numTargAmps);
 
+        qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits);  // loop-invariant: hoisted out of the per-amplitude loop
         #pragma omp for
         for (qindex n=0; n<numIts; n++) {
 
             // i0 = nth local index where ctrls are active and targs are all zero
-            qindex i0 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask);
+            qindex i0 = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numQubitBits);
 
             // collect and cache all to-be-modified amps (loop might be unrolled)
             for (qindex j=0; j<numTargAmps; j++) {
@@ -669,11 +678,12 @@ void cpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con
     // use template params to compile-time unroll loops in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
 
+    qindex ctrlsPosMask = getBitMask(sortedCtrls.data(), numCtrlBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // j = nth local index where ctrls are active (in the specified states)
-        qindex j = insertBitsWithMaskedValues(n, sortedCtrls.data(), numCtrlBits, ctrlStateMask);
+        qindex j = insertBitsWithMaskedValuesAndPosMask(n, ctrlStateMask, ctrlsPosMask, sortedCtrls.data(), numCtrlBits);
 
         // i = global index corresponding to j
         qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode);
@@ -711,11 +721,12 @@ void cpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con
     // use template params to compile-time unroll loops in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
 
+    qindex ctrlsPosMask = getBitMask(sortedCtrls.data(), numCtrlBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // j = nth local index where ctrls are active (in the specified states)
-        qindex j = insertBitsWithMaskedValues(n, sortedCtrls.data(), numCtrlBits, ctrlStateMask);
+        qindex j = insertBitsWithMaskedValuesAndPosMask(n, ctrlStateMask, ctrlsPosMask, sortedCtrls.data(), numCtrlBits);
 
         // i = global index corresponding to j
         qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode);
@@ -758,17 +769,20 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, ConstList64 ctrls, Con
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
     SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, targs.size());
 
+    qindex ctrlsPosMask = getBitMask(sortedCtrls.data(), numCtrlBits);  // loop-invariant: hoisted out of the per-amplitude loop
+    qindex targsPosMask = getBitMask(targs.data(), numTargBits);  // loop-invariant: hoisted out of the per-amplitude loop
+    bool targsSorted = isStrictlyIncreasing(targs.data(), numTargBits);  // likewise loop-invariant (order checked once per gate)
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // j = nth local index where ctrls are active (in the specified states)
-        qindex j = insertBitsWithMaskedValues(n, sortedCtrls.data(), numCtrlBits, ctrlStateMask);
+        qindex j = insertBitsWithMaskedValuesAndPosMask(n, ctrlStateMask, ctrlsPosMask, sortedCtrls.data(), numCtrlBits);
 
         // i = global index corresponding to j
         qindex i = concatenateBits(qureg.rank, j, qureg.logNumAmpsPerNode);
 
         // t = value of targeted bits, which may be in the prefix substate
-        qindex t = getValueOfBits(i, targs.data(), numTargBits);
+        qindex t = (targsSorted ? getValueOfBitsFromSortedPosMask(i, targsPosMask, targs.data(), numTargBits) : getValueOfBits(i, targs.data(), numTargBits));
         cpu_qcomp elem = elems[t];
 
         // decide whether to power and conj at compile-time, to avoid branching in hot-loop.
@@ -1012,11 +1026,12 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
     if (!qureg.isMultithreaded || numOuterIts >= cpu_getAvailableNumThreads()) {
     
         // parallel
+        qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits);  // loop-invariant: hoisted out of the per-amplitude loop
         #pragma omp parallel for if(qureg.isMultithreaded)
         for (qindex n=0; n<numOuterIts; n++) {
 
             // i0 = nth local index where ctrls are active and targs are all zero
-            qindex i0 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask);
+            qindex i0 = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numQubitBits);
 
             // serial
             for (qindex v=0; v<numInnerIts; v++)
@@ -1027,10 +1042,11 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
     } else {
 
         // serial
+        qindex qubitsPosMask = getBitMask(sortedQubits.data(), numQubitBits);  // loop-invariant: hoisted out of the per-amplitude loop
         for (qindex n=0; n<numOuterIts; n++) {
 
             // i0 = nth local index where ctrls are active and targs are all zero
-            qindex i0 = insertBitsWithMaskedValues(n, sortedQubits.data(), numQubitBits, qubitStateMask);
+            qindex i0 = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numQubitBits);
 
             // parallel
             #pragma omp parallel for
@@ -1072,11 +1088,12 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subB(
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
 
+    qindex ctrlsPosMask = getBitMask(sortedCtrls.data(), numCtrlBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = nth local index where ctrl bits are in specified states
-        qindex i = insertBitsWithMaskedValues(n, sortedCtrls.data(), numCtrlBits, ctrlStateMask);
+        qindex i = insertBitsWithMaskedValuesAndPosMask(n, ctrlStateMask, ctrlsPosMask, sortedCtrls.data(), numCtrlBits);
 
         // j = buffer index of amp to be mixed with i
         qindex j = flipBits(n, bufferMaskXY) + offset;
@@ -1122,11 +1139,12 @@ void cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, ctrls.size());
 
+    qindex ctrlsPosMask = getBitMask(sortedCtrls.data(), numCtrlBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = nth local index where ctrl bits are in specified states
-        qindex i = insertBitsWithMaskedValues(n, sortedCtrls.data(), numCtrlBits, ctrlStateMask);
+        qindex i = insertBitsWithMaskedValuesAndPosMask(n, ctrlStateMask, ctrlsPosMask, sortedCtrls.data(), numCtrlBits);
 
         // apply phase to amp depending on parity of targets
         int p = getBitMaskParity(i & targMask);
@@ -2009,11 +2027,12 @@ qreal cpu_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubi
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
 
+    qindex qubitsPosMask = getBitMask(sortedQubits.data(), numBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for reduction(+:prob) if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = nth local index where qubits are in the specified outcome state
-        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask);
+        qindex i = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numBits);
 
         prob += norm(amps[i]);
     }
@@ -2046,11 +2065,12 @@ qreal cpu_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, ConstList64 qubi
     // use template param to compile-time unroll loop in insertBits()
     SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
 
+    qindex qubitsPosMask = getBitMask(sortedQubits.data(), numBits);  // loop-invariant: hoisted out of the per-amplitude loop
     #pragma omp parallel for reduction(+:prob) if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // i = local statevector index of nth local basis state with a contributing diagonal
-        qindex i = insertBitsWithMaskedValues(n, sortedQubits.data(), numBits, qubitStateMask); // may be unrolled at compile-time
+        qindex i = insertBitsWithMaskedValuesAndPosMask(n, qubitStateMask, qubitsPosMask, sortedQubits.data(), numBits);
 
         // j = local, flat, density-matrix index of diagonal amp corresponding to state i
         qindex j = fast_getQuregLocalIndexOfDiagonalAmp(i, firstDiagInd, numAmpsPerCol);
@@ -2086,6 +2106,8 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     for (int i=0; i<numOutcomes; i++)
         outProbs[i] = 0;
     
+    qindex qubitsPosMask = getBitMask(qubits.data(), numBits);  // loop-invariant: hoisted out of the per-amplitude loop
+    bool qubitsSorted = isStrictlyIncreasing(qubits.data(), numBits);  // likewise loop-invariant (order checked once per gate)
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
@@ -2095,7 +2117,7 @@ void cpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
         qindex i = concatenateBits(qureg.rank, n, qureg.logNumAmpsPerNode);
 
         // j = outcome index corresponding to prob
-        qindex j = getValueOfBits(i, qubits.data(), numBits); // loop therein may be unrolled
+        qindex j = (qubitsSorted ? getValueOfBitsFromSortedPosMask(i, qubitsPosMask, qubits.data(), numBits) : getValueOfBits(i, qubits.data(), numBits));
 
         #pragma omp atomic
         outProbs[j] += prob;
@@ -2129,6 +2151,8 @@ void cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     for (int i=0; i<numOutcomes; i++)
         outProbs[i] = 0;
 
+    qindex qubitsPosMask = getBitMask(qubits.data(), numBits);  // loop-invariant: hoisted out of the per-amplitude loop
+    bool qubitsSorted = isStrictlyIncreasing(qubits.data(), numBits);  // likewise loop-invariant (order checked once per gate)
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
@@ -2140,7 +2164,7 @@ void cpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
         qindex j = concatenateBits(qureg.rank, i, qureg.logNumAmpsPerNode);
 
         // k = outcome index corresponding to basis state j
-        qindex k = getValueOfBits(j, qubits.data(), numBits); // loop therein may be unrolled
+        qindex k = (qubitsSorted ? getValueOfBitsFromSortedPosMask(j, qubitsPosMask, qubits.data(), numBits) : getValueOfBits(j, qubits.data(), numBits));
 
         #pragma omp atomic
         outProbs[k] += prob;
@@ -2581,11 +2605,13 @@ void cpu_statevec_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
     // use template param to compile-time unroll loop in getValueOfBits()
     SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
 
+    qindex qubitsPosMask = getBitMask(qubits.data(), numBits);  // loop-invariant: hoisted out of the per-amplitude loop
+    bool qubitsSorted = isStrictlyIncreasing(qubits.data(), numBits);  // likewise loop-invariant (order checked once per gate)
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
         // val = outcomes corresponding to n-th local amp (all qubits are in suffix)
-        qindex val = getValueOfBits(n, qubits.data(), numBits);
+        qindex val = (qubitsSorted ? getValueOfBitsFromSortedPosMask(n, qubitsPosMask, qubits.data(), numBits) : getValueOfBits(n, qubits.data(), numBits));
 
         // multiply amp with renorm or zero, if qubit value matches or disagrees
         amps[n] *= renorm * (val == retainValue);
@@ -2616,6 +2642,8 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
     // use template param to compile-time unroll loops in getValueOfBits()
     SET_VAR_AT_COMPILE_TIME(int, numBits, NumQubits, qubits.size());
 
+    qindex qubitsPosMask = getBitMask(qubits.data(), numBits);  // loop-invariant: hoisted out of the per-amplitude loop
+    bool qubitsSorted = isStrictlyIncreasing(qubits.data(), numBits);  // likewise loop-invariant (order checked once per gate)
     #pragma omp parallel for if(qureg.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
@@ -2626,8 +2654,8 @@ void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, ConstList64 qubits, Const
         qindex r = getBitsRightOfIndex(i, qureg.numQubits);
         qindex c = getBitsLeftOfIndex(i, qureg.numQubits-1);
 
-        qindex v1 = getValueOfBits(r, qubits.data(), numBits);
-        qindex v2 = getValueOfBits(c, qubits.data(), numBits);
+        qindex v1 = (qubitsSorted ? getValueOfBitsFromSortedPosMask(r, qubitsPosMask, qubits.data(), numBits) : getValueOfBits(r, qubits.data(), numBits));
+        qindex v2 = (qubitsSorted ? getValueOfBitsFromSortedPosMask(c, qubitsPosMask, qubits.data(), numBits) : getValueOfBits(c, qubits.data(), numBits));
 
         // multiply amp with renorm or zero if values disagree with given outcomes
         amps[n] *= renorm * (v1 == v2) * (retainValue == v1);
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 59341759f..80cc58eaf 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -2,6 +2,7 @@
 
 target_sources(tests
   PUBLIC
+  bitwise.cpp
   calculations.cpp
   channels.cpp
   debug.cpp
@@ -16,4 +17,20 @@ target_sources(tests
   qureg.cpp
   trotterisation.cpp
   types.cpp
-)
\ No newline at end of file
+)
+
+# When the user opts in via QUEST_ENABLE_BMI2, compile only the issue-#717 bitwise test with -mbmi2 so
+# it exercises the actual PEXT/PDEP path; otherwise (the default) it exercises the scalar fallback. The
+# assertions hold identically either way. Source-file properties are by default only visible to targets
+# created in this directory, so TARGET_DIRECTORY (CMake >= 3.18) applies it to the parent-scope 'tests'.
+if (QUEST_ENABLE_BMI2)
+  include(CheckCXXCompilerFlag)
+  check_cxx_compiler_flag("-mbmi2" QUEST_TEST_SUPPORTS_MBMI2)
+  if (QUEST_TEST_SUPPORTS_MBMI2)
+    set_source_files_properties(
+      bitwise.cpp
+      TARGET_DIRECTORY tests
+      PROPERTIES COMPILE_OPTIONS "-mbmi2"
+    )
+  endif()
+endif()
diff --git a/tests/unit/bitwise.cpp b/tests/unit/bitwise.cpp
new file mode 100644
index 000000000..e5c4787a1
--- /dev/null
+++ b/tests/unit/bitwise.cpp
@@ -0,0 +1,161 @@
+/** @file
+ * Unit tests for the BMI2 bit gather/scatter helpers added for issue #717
+ * (quest/src/core/bitwise.hpp). The optimisation changes only how basis-state
+ * indices are computed, so the new mask-accepting helpers must be bit-for-bit
+ * identical to the original scalar routines they accelerate. These tests assert
+ * exactly that, over exhaustive-small and randomised inputs.
+ *
+ * When this translation unit is compiled with BMI2 enabled (see
+ * tests/unit/CMakeLists.txt) the helpers exercise the _pext_u64 / _pdep_u64
+ * intrinsics; otherwise they exercise the scalar fallback. Both must agree with
+ * the originals, so the same assertions hold either way.
+ *
+ * @author (issue #717 contribution)
+ *
+ * @defgroup unitbitwise Bitwise
+ * @ingroup unittests
+ */
+
+#include "quest/src/core/bitwise.hpp"
+
+#include <catch2/catch_test_macros.hpp>
+
+#include <vector>
+#include <random>
+#include <algorithm>
+
+namespace {
+
+    // draws k distinct bit indices from [0, maxBit) and returns them in strictly increasing order
+    std::vector<int> randomIncreasingInds(std::mt19937_64& rng, int k, int maxBit) {
+        std::vector<int> candidates;
+        candidates.reserve(maxBit);
+        for (int bit=0; bit<maxBit; bit++) candidates.push_back(bit);
+        std::shuffle(candidates.begin(), candidates.end(), rng);
+        candidates.resize(k);   // retain only the first k of the shuffled indices
+        std::sort(candidates.begin(), candidates.end());
+        return candidates;
+    }
+}
+
+TEST_CASE( "issue #717 helpers compiled path", "[bitwise]" ) {
+
+    // report, via the test log, which implementation this translation unit was compiled against
+#ifndef QUEST_BITWISE_USE_BMI2
+    WARN( "bitwise helpers compiled with the scalar fallback (BMI2 not targeted)" );
+#else
+    WARN( "bitwise helpers compiled with BMI2 PEXT/PDEP enabled" );
+#endif
+    SUCCEED();
+}
+
+TEST_CASE( "getValueOfBitsFromSortedPosMask matches getValueOfBits", "[bitwise]" ) {
+
+    const int maxNumInds = 12, numTrials = 200, numSamples = 8, maxBit = 50;
+    std::mt19937_64 rng(0x717ULL);
+
+    for (int numInds=0; numInds<=maxNumInds; numInds++) {
+        for (int trial=0; trial<numTrials; trial++) {
+
+            // an empty index list is built directly, so that it consumes no random numbers
+            std::vector<int> inds;
+            if (numInds > 0)
+                inds = randomIncreasingInds(rng, numInds, maxBit);
+            qindex posMask = getBitMask(inds.data(), numInds);
+
+            for (int sample=0; sample<numSamples; sample++) {
+                qindex number = (qindex) (rng() & ((1ULL<<maxBit) - 1));   // confined to bits [0,maxBit)
+                qindex expected = getValueOfBits(number, inds.data(), numInds);
+                REQUIRE( getValueOfBitsFromSortedPosMask(number, posMask, inds.data(), numInds) == expected );
+            }
+        }
+    }
+}
+
+TEST_CASE( "insertBitsWithMaskedValuesAndPosMask matches insertBitsWithMaskedValues", "[bitwise]" ) {
+
+    const int maxNumInds = 12, numTrials = 200, numSamples = 8, maxBit = 50, numInputBits = 40;
+    std::mt19937_64 rng(0x718ULL);
+
+    for (int numInds=0; numInds<=maxNumInds; numInds++) {
+        for (int trial=0; trial<numTrials; trial++) {
+
+            // an empty index list is built directly, so that it consumes no random numbers
+            std::vector<int> inds;
+            if (numInds > 0)
+                inds = randomIncreasingInds(rng, numInds, maxBit);
+            qindex posMask = getBitMask(inds.data(), numInds);
+
+            // per the original contract, the value mask is zero except at the inserted positions
+            qindex valueMask = ((qindex) rng()) & posMask;
+
+            for (int sample=0; sample<numSamples; sample++) {
+                qindex number = (qindex) (rng() & ((1ULL<<numInputBits) - 1));   // avoid shifting bits past bit 63
+                qindex expected = insertBitsWithMaskedValues(number, inds.data(), numInds, valueMask);
+                REQUIRE( insertBitsWithMaskedValuesAndPosMask(number, valueMask, posMask, inds.data(), numInds) == expected );
+            }
+        }
+    }
+}
+
+TEST_CASE( "helpers match at boundary bit positions", "[bitwise]" ) {
+
+    // Deterministic coverage of the awkward positions the randomised tests above hit only by chance
+    // (the 32-bit word boundary, 31/32) or never reach (the high bits 61/62/63). Bit 63 is the sign bit
+    // of the signed qindex, where the scalar (arithmetic-shift) and BMI2 (unsigned PEXT/PDEP) paths are
+    // most likely to disagree if anything is wrong.
+    const std::vector<std::vector<int>> indexSets = {
+        {31}, {32}, {63}, {31, 32}, {62, 63}, {0, 63},
+        {0, 31, 32, 63}, {30, 31, 32, 33}, {59, 60, 61, 62, 63},
+    };
+    const std::vector<unsigned long long> numbers = {
+        0ULL,
+        ~0ULL,                          // all bits set
+        1ULL << 63,                     // only the sign bit
+        (1ULL << 63) | 1ULL,            // sign bit + bit 0
+        0x00000000FFFFFFFFULL,          // low 32
+        0xFFFFFFFF00000000ULL,          // high 32
+        (1ULL << 31) | (1ULL << 32),    // straddle the word boundary
+        0xAAAAAAAAAAAAAAAAULL,          // alternating
+        0x5555555555555555ULL,
+    };
+
+    for (const auto& inds : indexSets) {
+        int k = (int) inds.size();
+        qindex posMask = getBitMask(inds.data(), k);
+
+        for (unsigned long long raw : numbers) {
+
+            // gather: any 64-bit input is valid (reads bits, incl. bit 63 of a negative qindex)
+            qindex g = (qindex) raw;
+            REQUIRE(
+                getValueOfBitsFromSortedPosMask(g, posMask, inds.data(), k) ==
+                getValueOfBits(g, inds.data(), k) );
+
+            // insert: keep the input within its low (64-k) significant bits so the scalar reference
+            // is well-defined (no shift past bit 63); still lets a high input bit land on position 63.
+            unsigned long long fitMask = (k == 0) ? ~0ULL : ((1ULL << (64 - k)) - 1);
+            qindex n = (qindex) (raw & fitMask);
+            for (qindex valueMask : { (qindex) 0, (qindex) (g & posMask) }) {
+                REQUIRE(
+                    insertBitsWithMaskedValuesAndPosMask(n, valueMask, posMask, inds.data(), k) ==
+                    insertBitsWithMaskedValues(n, inds.data(), k, valueMask) );
+            }
+        }
+    }
+}
+
+TEST_CASE( "isStrictlyIncreasing detects order", "[bitwise]" ) {
+
+    int ascending[]  = {0, 2, 5, 9};
+    int repeated[]   = {0, 2, 2, 9};   // adjacent duplicate: non-decreasing, but not strictly increasing
+    int descending[] = {9, 5, 2, 0};
+
+    // lists of zero or one element are trivially ordered
+    REQUIRE( isStrictlyIncreasing(ascending, 0) );
+    REQUIRE( isStrictlyIncreasing(ascending, 1) );
+
+    REQUIRE( isStrictlyIncreasing(ascending, 4) );
+    REQUIRE_FALSE( isStrictlyIncreasing(repeated, 4) );
+    REQUIRE_FALSE( isStrictlyIncreasing(descending, 4) );
+}

From ce7c3ae0b839de7bf131baaebe117d570aee5389 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 15 Jun 2026 15:20:57 -0400
Subject: [PATCH 2/2] Tailor CI

---
 .github/workflows/compile.yml   | 62 +++++++++++++++++++--------------
 .github/workflows/test_free.yml | 12 ++++---
 2 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index c86de84f1..c66be8dbe 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -23,6 +23,10 @@
 name: compile
 
 
+### DEBUG
+### disabled all but single-CPU
+
+
 on:
   push:
     branches:
@@ -60,14 +64,14 @@ jobs:
 
       # compile QuEST with all combinations of below flags
       matrix:
-        os: [windows-latest, ubuntu-latest, macos-latest]
-        precision: [1, 2, 4]
-        omp:       [ON, OFF]
-        mpi:       [ON, OFF]
-        cuda:      [ON, OFF]
-        hip:       [ON, OFF]
-        cuquantum: [ON, OFF]
-        mpilib:    ['', 'mpich', 'ompi', 'impi', 'msmpi']
+        os: [windows-latest, ubuntu-latest, macos-latest, macos-15-intel, macos-26-intel]
+        precision: [2] #[1, 2, 4]
+        omp:       [OFF] #[ON, OFF]
+        mpi:       [OFF] #[ON, OFF]
+        cuda:      [OFF] #[ON, OFF]
+        hip:       [OFF] #[ON, OFF]
+        cuquantum: [OFF] #[ON, OFF]
+        mpilib:    [''] #['', 'mpich', 'ompi', 'impi', 'msmpi']
 
         # disable deprecated API on MSVC, and assign unique compilers,
         # so that we can concisely consult e.g. matrix.compiler=='cl'
@@ -240,7 +244,7 @@ jobs:
         run: >
           cmake -B ${{ env.build_dir }}
           -DQUEST_BUILD_EXAMPLES=ON
-          -DQUEST_BUILD_TESTS=ON
+          -DQUEST_BUILD_TESTS=OFF
           -DQUEST_FLOAT_PRECISION=${{ matrix.precision }}
           -DQUEST_ENABLE_DEPRECATED_API=${{ matrix.deprecated }}
           -DQUEST_DISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }}
@@ -260,24 +264,24 @@ jobs:
 
       # run all compiled isolated examples to test for link-time errors,
       # continuing if any fail (since some deliberately fail)
-      - name: Run isolated examples (Windows)
-        if: ${{ matrix.os == 'windows-latest' }}
-        working-directory: ${{ env.isolated_dir }}/Release/
-        shell: pwsh
-        run: |
-          Get-ChildItem -Filter '*.exe' -File |
-          ForEach-Object {
-            Write-Host "`r`n[[[ $($_.Name) ]]]`r`n"
-            & $_.FullName
-          }
-      - name: Run isolated examples (Unix)
-        if: ${{ matrix.os != 'windows-latest' }}
-        working-directory: ${{ env.isolated_dir }}
-        run: |
-          for fn in *_c *_cpp; do
-            printf "\n[[[ $fn ]]]\n"
-            ./$fn || true
-          done
+      # - name: Run isolated examples (Windows)
+      #   if: ${{ matrix.os == 'windows-latest' }}
+      #   working-directory: ${{ env.isolated_dir }}/Release/
+      #   shell: pwsh
+      #   run: |
+      #     Get-ChildItem -Filter '*.exe' -File |
+      #     ForEach-Object {
+      #       Write-Host "`r`n[[[ $($_.Name) ]]]`r`n"
+      #       & $_.FullName
+      #     }
+      # - name: Run isolated examples (Unix)
+      #   if: ${{ matrix.os != 'windows-latest' }}
+      #   working-directory: ${{ env.isolated_dir }}
+      #   run: |
+      #     for fn in *_c *_cpp; do
+      #       printf "\n[[[ $fn ]]]\n"
+      #       ./$fn || true
+      #     done
 
       # run all compiled 'automated' examples
       - name: Run automated examples (Windows)
@@ -289,6 +293,10 @@ jobs:
           ForEach-Object {
             Write-Host "`r`n[[[ $($_.Name) ]]]`r`n"
             & $_.FullName
+            if ($LASTEXITCODE -ne 0) {
+              Write-Warning "$($_.Name) exited with code $LASTEXITCODE"
+              $global:LASTEXITCODE = 0
+            }
           }
       - name: Run automated examples (Unix)
         if: ${{ matrix.os != 'windows-latest' }}
diff --git a/.github/workflows/test_free.yml b/.github/workflows/test_free.yml
index 2d332e842..f6c20e1dd 100644
--- a/.github/workflows/test_free.yml
+++ b/.github/workflows/test_free.yml
@@ -10,6 +10,10 @@
 name: test (free, serial)
 
 
+### DEBUG
+### disabled all but single-CPU
+
+
 on:
   push:
     branches:
@@ -27,7 +31,7 @@ jobs:
   # excluding the v4 integration tests, for free
   serial-unit-test:
     name: >
-      ${{ matrix.os == 'ubuntu-latest' && 'Linux' || matrix.os == 'macos-latest' && 'MacOS' || 'Windows' }}
+      ${{ matrix.os == 'ubuntu-latest' && 'Linux' || startsWith(matrix.os, 'macos') && 'MacOS' || 'Windows' }}
       [${{ matrix.precision }}]
       serial
       unit v${{ matrix.version }}
@@ -40,9 +44,9 @@ jobs:
 
       # we will compile QuEST with all precisions but no parallelisation
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
-        version: [3, 4]
-        precision: [1, 2, 4]
+        os: [ubuntu-latest, macos-latest, windows-latest,  macos-15-intel, macos-26-intel]
+        version: [4] # [3, 4]
+        precision: [2] # [1, 2, 4]
 
         # MSVC cannot compile deprecated v3 tests
         exclude: