From 0d5310517aa337e611aaa1c1e87e4a94fe3033ca Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 9 Feb 2026 16:16:50 -0500 Subject: [PATCH 01/11] measure scans Signed-off-by: Andrew Duffy fixup Signed-off-by: Andrew Duffy --- Cargo.lock | 26 +++ Cargo.toml | 1 + _typos.toml | 2 +- .../src/arrays/primitive/vtable/mod.rs | 5 + vortex-btrblocks/src/builder.rs | 9 + vortex-cuda/Cargo.toml | 8 +- vortex-cuda/benches/bitpacked_cuda.rs | 79 ++------ vortex-cuda/benches/common/mod.rs | 39 ++++ vortex-cuda/benches/date_time_parts_cuda.rs | 106 ++--------- vortex-cuda/benches/dict_cuda.rs | 94 +++------- vortex-cuda/benches/for_cuda.rs | 123 +++++++----- vortex-cuda/benches/runend_cuda.rs | 90 ++------- vortex-cuda/src/executor.rs | 72 ++++++- vortex-cuda/src/kernel/arrays/constant.rs | 35 ++-- vortex-cuda/src/kernel/arrays/dict.rs | 59 +++--- vortex-cuda/src/kernel/arrays/shared.rs | 5 + vortex-cuda/src/kernel/encodings/alp.rs | 29 ++- vortex-cuda/src/kernel/encodings/bitpacked.rs | 24 +-- .../src/kernel/encodings/date_time_parts.rs | 24 +-- .../kernel/encodings/decimal_byte_parts.rs | 5 + vortex-cuda/src/kernel/encodings/for_.rs | 20 +- vortex-cuda/src/kernel/encodings/runend.rs | 28 +-- vortex-cuda/src/kernel/encodings/sequence.rs | 20 +- vortex-cuda/src/kernel/encodings/zigzag.rs | 20 +- vortex-cuda/src/kernel/encodings/zstd.rs | 54 +++++- vortex-cuda/src/kernel/filter/mod.rs | 4 + vortex-cuda/src/kernel/mod.rs | 93 ++++----- vortex-cuda/src/kernel/patches/mod.rs | 25 +-- vortex-cuda/src/kernel/slice/mod.rs | 4 + vortex-cuda/src/lib.rs | 35 ++-- vortex-cuda/src/macros.rs | 42 +++++ vortex-cuda/src/session.rs | 7 +- vortex-cuda/src/stream.rs | 10 +- vortex-python/src/arrow.rs | 2 +- vortex-test/e2e-cuda-scan/Cargo.toml | 24 +++ vortex-test/e2e-cuda-scan/src/main.rs | 177 ++++++++++++++++++ vortex/src/lib.rs | 3 + 37 files changed, 831 insertions(+), 572 deletions(-) create mode 100644 vortex-cuda/benches/common/mod.rs create mode 100644 vortex-cuda/src/macros.rs create mode 100644 vortex-test/e2e-cuda-scan/Cargo.toml create mode 100644 vortex-test/e2e-cuda-scan/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index ff4e930bf21..5034b68ef4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9549,6 +9549,16 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.22" @@ -9559,12 +9569,15 @@ dependencies = [ "nu-ansi-term", "once_cell", "regex-automata", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", "tracing", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] @@ -10088,6 +10101,7 @@ dependencies = [ "arrow-schema", "async-trait", "bindgen", + "bytes", "codspeed-criterion-compat-walltime", "cudarc", "fastlanes", @@ -10797,6 +10811,18 @@ dependencies = [ "vortex-cuda", ] +[[package]] +name = "vortex-test-e2e-cuda-scan" +version = "0.1.0" +dependencies = [ + "futures", + "tokio", + "tracing", + "tracing-subscriber", + "vortex", + "vortex-cuda", +] + [[package]] name = "vortex-tui" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 50d6bebe4d5..349af16c976 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ members = [ "vortex-tui", "vortex-test/e2e", "vortex-test/e2e-cuda", + "vortex-test/e2e-cuda-scan", "xtask", # Encodings "encodings/fastlanes", diff --git 
a/_typos.toml b/_typos.toml index b2af33e423b..5d601d1036f 100644 --- a/_typos.toml +++ b/_typos.toml @@ -1,5 +1,5 @@ [default] -extend-ignore-identifiers-re = ["FoR", "typ"] +extend-ignore-identifiers-re = ["ffor", "FFOR", "FoR", "typ"] # We support a few common special comments to tell the checker to ignore sections of code extend-ignore-re = [ "(#|//)\\s*spellchecker:ignore-next-line\\n.*", # Ignore the next line diff --git a/vortex-array/src/arrays/primitive/vtable/mod.rs b/vortex-array/src/arrays/primitive/vtable/mod.rs index d316e6e93cd..450f9faf99e 100644 --- a/vortex-array/src/arrays/primitive/vtable/mod.rs +++ b/vortex-array/src/arrays/primitive/vtable/mod.rs @@ -89,6 +89,11 @@ impl VTable for PrimitiveVTable { let ptype = PType::try_from(dtype)?; + vortex_ensure!( + buffer.is_aligned_to(Alignment::new(ptype.byte_width())), + "Misaligned buffer cannot be used to build PrimitiveArray of {ptype}" + ); + if buffer.len() != ptype.byte_width() * len { vortex_bail!( "Buffer length {} does not match expected length {} for {}, {}", diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index b71ca7f9caf..d329ec8c139 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder { } impl BtrBlocksCompressorBuilder { + /// Create a new builder with no encodings enabled. + pub fn empty() -> Self { + Self { + int_schemes: Default::default(), + float_schemes: Default::default(), + string_schemes: Default::default(), + } + } + /// Excludes the specified integer compression schemes. pub fn exclude_int(mut self, codes: impl IntoIterator) -> Self { let codes: HashSet<_> = codes.into_iter().collect(); diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index b17c6b75694..197890664d5 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -18,6 +18,7 @@ workspace = true [features] default = [] +tracing = ["dep:tracing"] _test-harness = [] unstable_encodings = ["vortex-zstd/unstable_encodings"] @@ -26,12 +27,17 @@ arc-swap = { workspace = true } arrow-data = { workspace = true, features = ["ffi"] } arrow-schema = { workspace = true, features = ["ffi"] } async-trait = { workspace = true } +bytes = { workspace = true } cudarc = { workspace = true, features = ["f16"] } fastlanes = { workspace = true } futures = { workspace = true, features = ["executor"] } kanal = { workspace = true } paste = { workspace = true } -tracing = { workspace = true } +tokio = { workspace = true, features = ["fs"] } +tracing = { workspace = true, features = [ + "std", + "attributes", +], optional = true } vortex-alp = { workspace = true } vortex-array = { workspace = true } vortex-buffer = { workspace = true } diff --git a/vortex-cuda/benches/bitpacked_cuda.rs b/vortex-cuda/benches/bitpacked_cuda.rs index b551c91c9e1..0ef0e7f03f0 100644 --- a/vortex-cuda/benches/bitpacked_cuda.rs +++ b/vortex-cuda/benches/bitpacked_cuda.rs @@ -6,27 +6,25 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; use std::ops::Add; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; use cudarc::driver::DeviceRepr; -use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity::NonNullable; use 
vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaDeviceBuffer; -use vortex_cuda::CudaExecutionCtx; +use vortex_cuda::BitPackedExecutor; use vortex_cuda::CudaSession; -use vortex_cuda::bitpacked_cuda_kernel; -use vortex_cuda::bitpacked_cuda_launch_config; -use vortex_cuda::launch_cuda_kernel_with_config; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -35,6 +33,8 @@ use vortex_fastlanes::BitPackedArray; use vortex_fastlanes::unpack_iter::BitPacked; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + const N_ROWS: usize = 100_000_000; /// Create a bit-packed array with the given bit width @@ -56,54 +56,6 @@ where .vortex_expect("failed to create BitPacked array") } -/// Launch the bit unpacking kernel and return elapsed GPU time -fn launch_bitunpack_kernel_timed_typed( - bitpacked_array: &BitPackedArray, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult -where - T: BitPacked + DeviceRepr, - T::Physical: DeviceRepr, -{ - let packed = bitpacked_array.packed().clone(); - let bit_width = bitpacked_array.bit_width(); - let len = bitpacked_array.len(); - - // Move packed data to device if not already there - let device_input = if packed.is_on_device() { - packed - } else { - block_on(cuda_ctx.move_to_device(packed)?).vortex_expect("failed to move to device") - }; - - // Allocate output buffer - let output_slice = cuda_ctx - .device_alloc::(len.next_multiple_of(1024)) - .vortex_expect("failed to allocate output"); - let output_buf = CudaDeviceBuffer::new(output_slice); - - // Get device views - let input_view = device_input - .cuda_view::() - .vortex_expect("failed to get input view"); - let output_view = output_buf.as_view::(); - - let output_width = size_of::() * 8; - let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, cuda_ctx)?; - let mut launch_builder = cuda_ctx.launch_builder(&cuda_function); - - launch_builder.arg(&input_view); - launch_builder.arg(&output_view); - - let config = bitpacked_cuda_launch_config(output_width, len)?; - - // Launch kernel - let events = - launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_BLOCKING_SYNC)?; - - events.duration() -} - /// Generic benchmark function for a specific type and bit width fn benchmark_bitunpack_typed(c: &mut Criterion, bit_width: u8, type_name: &str) where @@ -123,19 +75,18 @@ where &array, |b, array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); - let mut total_time = Duration::ZERO; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = - launch_bitunpack_kernel_timed_typed::(array, &mut cuda_ctx) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + block_on(BitPackedExecutor.execute(array.to_array(), &mut cuda_ctx)).unwrap(); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); diff --git a/vortex-cuda/benches/common/mod.rs b/vortex-cuda/benches/common/mod.rs new file mode 100644 index 00000000000..94273ae599d --- /dev/null +++ b/vortex-cuda/benches/common/mod.rs @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: Apache-2.0 +// 
SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; + +use cudarc::driver::sys::CUevent_flags; +use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; +use vortex_cuda::CudaKernelEvents; +use vortex_cuda::LaunchStrategy; +use vortex_error::VortexResult; + +#[derive(Debug, Default)] +pub struct TimedLaunchStrategy { + total_time_ns: Arc, +} + +impl TimedLaunchStrategy { + pub fn get(&self) -> &Arc { + &self.total_time_ns + } +} + +impl LaunchStrategy for TimedLaunchStrategy { + fn event_flags(&self) -> CUevent_flags { + // using blocking_sync to make sure all events flush before we complete. + CU_EVENT_BLOCKING_SYNC + } + + fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> { + // NOTE: as long as the duration < 584 years this cast is safe. + let elapsed_nanos = events.duration()?.as_nanos() as u64; + self.total_time_ns + .fetch_add(elapsed_nanos, Ordering::Relaxed); + + Ok(()) + } +} diff --git a/vortex-cuda/benches/date_time_parts_cuda.rs b/vortex-cuda/benches/date_time_parts_cuda.rs index df38a563363..4a142974082 100644 --- a/vortex-cuda/benches/date_time_parts_cuda.rs +++ b/vortex-cuda/benches/date_time_parts_cuda.rs @@ -6,34 +6,37 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::IntoArray; -use vortex_array::ToCanonical; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaExecutionCtx; use vortex_cuda::CudaSession; +use vortex_cuda::DateTimePartsExecutor; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_datetime_parts::DateTimePartsArray; use vortex_dtype::DType; use vortex_dtype::Nullability; -use vortex_dtype::PType; use vortex_dtype::datetime::TimeUnit; use vortex_dtype::datetime::Timestamp; use vortex_error::VortexExpect; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray { let days: Vec = (0..len).map(|i| (i / 1000) as i16).collect(); let days_arr = PrimitiveArray::new(Buffer::from(days), Validity::NonNullable).into_array(); @@ -46,80 +49,6 @@ fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArr .vortex_expect("Failed to create DateTimePartsArray") } -/// Launches DateTimeParts decode kernel and returns elapsed GPU time. 
-fn launch_datetimeparts_kernel_timed( - dtp_array: &DateTimePartsArray, - time_unit: TimeUnit, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult { - let days_prim = dtp_array.days().to_primitive(); - - // TODO(0ax1): figure out how to represent constant array in CUDA kernels - let seconds_prim = dtp_array.seconds().to_primitive(); - let subseconds_prim = dtp_array.subseconds().to_primitive(); - - let output_len = dtp_array.len(); - - let divisor: i64 = match time_unit { - TimeUnit::Nanoseconds => 1_000_000_000, - TimeUnit::Microseconds => 1_000_000, - TimeUnit::Milliseconds => 1_000, - TimeUnit::Seconds => 1, - TimeUnit::Days => unreachable!("Days not supported for DateTimeParts"), - }; - - let days_device = block_on( - cuda_ctx - .copy_to_device(days_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy days to device"); - - let seconds_device = block_on( - cuda_ctx - .copy_to_device(seconds_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy seconds to device"); - - let subseconds_device = block_on( - cuda_ctx - .copy_to_device(subseconds_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy subseconds to device"); - - // Allocate output buffer - let output_device = block_on(cuda_ctx.copy_to_device(vec![0i64; output_len]).unwrap()) - .vortex_expect("failed to allocate output buffer"); - - let days_view = days_device - .cuda_view::() - .vortex_expect("failed to get days view"); - let seconds_view = seconds_device - .cuda_view::() - .vortex_expect("failed to get seconds view"); - let subseconds_view = subseconds_device - .cuda_view::() - .vortex_expect("failed to get subseconds view"); - let output_view = output_device - .cuda_view::() - .vortex_expect("failed to get output view"); - - let array_len_u64 = output_len as u64; - - let events = vortex_cuda::launch_cuda_kernel!( - execution_ctx: cuda_ctx, - module: "date_time_parts", - ptypes: &[PType::I16, PType::I8, PType::I8], - launch_args: [days_view, seconds_view, subseconds_view, divisor, output_view, array_len_u64], - event_recording: CU_EVENT_BLOCKING_SYNC, - array_len: output_len - ); - - events.duration() -} - fn benchmark_datetimeparts(c: &mut Criterion) { let mut group = c.benchmark_group("datetimeparts_cuda"); group.sample_size(10); @@ -139,19 +68,22 @@ fn benchmark_datetimeparts(c: &mut Criterion) { &dtp_array, |b, dtp_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); - let mut total_time = Duration::ZERO; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = - launch_datetimeparts_kernel_timed(dtp_array, time_unit, &mut cuda_ctx) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + // block on immediately here + block_on( + DateTimePartsExecutor.execute(dtp_array.to_array(), &mut cuda_ctx), + ) + .unwrap(); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index c555d799a30..5c1ae658b38 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -6,30 +6,34 @@ #![allow(clippy::unwrap_used)] 
#![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::IntoArray; use vortex_array::arrays::DictArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity::NonNullable; use vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaDeviceBuffer; -use vortex_cuda::CudaExecutionCtx; use vortex_cuda::CudaSession; +use vortex_cuda::DictExecutor; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; use vortex_error::VortexExpect; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")]; /// Configuration for a dictionary benchmark specifying value and code types along with dictionary size. @@ -40,7 +44,7 @@ struct DictBenchConfig { } /// Creates a Dict array with parameterized value type V and code type C. -fn make_dict_array_typed(len: usize, dict_size: usize) -> (DictArray, Vec, Vec) +fn make_dict_array_typed(len: usize, dict_size: usize) -> DictArray where V: NativePType + From, C: NativePType + TryFrom, @@ -50,62 +54,16 @@ where let values: Vec = (0..dict_size) .map(|i| >::from((i * 1000) as u32)) .collect(); - let values_array = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let values_array = PrimitiveArray::new(Buffer::from(values), NonNullable); // Codes cycling through all dictionary values let codes: Vec = (0..len) .map(|i| C::try_from(i % dict_size).unwrap()) .collect(); - let codes_array = PrimitiveArray::new(Buffer::from(codes.clone()), NonNullable); - - let dict_array = DictArray::try_new(codes_array.into_array(), values_array.into_array()) - .vortex_expect("failed to create Dict array"); + let codes_array = PrimitiveArray::new(Buffer::from(codes), NonNullable); - (dict_array, values, codes) -} - -/// Launches Dict decompression kernel and returns elapsed GPU time. 
-fn launch_dict_kernel_timed_typed( - values: &[V], - codes: &[C], - output_len: usize, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult -where - V: NativePType + DeviceRepr, - C: NativePType + DeviceRepr, -{ - let values_device = block_on(cuda_ctx.copy_to_device(values.to_vec()).unwrap()) - .vortex_expect("failed to copy values to device"); - - let codes_device = block_on(cuda_ctx.copy_to_device(codes.to_vec()).unwrap()) - .vortex_expect("failed to copy codes to device"); - - let output_slice = cuda_ctx - .device_alloc::(output_len) - .vortex_expect("failed to allocate output"); - let output_device = CudaDeviceBuffer::new(output_slice); - - let codes_view = codes_device - .cuda_view::() - .vortex_expect("failed to get codes view"); - let values_view = values_device - .cuda_view::() - .vortex_expect("failed to get values view"); - let output_view = output_device.as_view::(); - - let codes_len_u64 = output_len as u64; - - let events = vortex_cuda::launch_cuda_kernel!( - execution_ctx: cuda_ctx, - module: "dict", - ptypes: &[V::PTYPE, C::PTYPE], - launch_args: [codes_view, codes_len_u64, values_view, output_view], - event_recording: CU_EVENT_BLOCKING_SYNC, - array_len: output_len - ); - - events.duration() + DictArray::try_new(codes_array.into_array(), values_array.into_array()) + .vortex_expect("failed to create Dict array") } /// Benchmark Dict decompression for specific value and code types. @@ -122,7 +80,7 @@ where // Throughput is based on output size (values read from dictionary) group.throughput(Throughput::Bytes((len * size_of::()) as u64)); - let (dict_array, values, codes) = make_dict_array_typed::(*len, config.dict_size); + let dict_array = make_dict_array_typed::(*len, config.dict_size); group.bench_with_input( BenchmarkId::new( @@ -132,26 +90,22 @@ where config.value_type_name, config.code_type_name ), ), - &(dict_array, values, codes), - |b, (dict_array, values, codes)| { + &dict_array, + |b, dict_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); - let mut total_time = Duration::ZERO; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = launch_dict_kernel_timed_typed::( - values, - codes, - dict_array.len(), - &mut cuda_ctx, - ) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + block_on(DictExecutor.execute(dict_array.to_array(), &mut cuda_ctx)) + .vortex_expect("execute"); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index dcd76d9ea11..56b50486750 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -6,118 +6,135 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; use std::ops::Add; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::IntoArray; -use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; 
use vortex_array::validity::Validity; use vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaExecutionCtx; use vortex_cuda::CudaSession; +use vortex_cuda::FoRExecutor; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; +use vortex_dtype::PType; use vortex_error::VortexExpect; +use vortex_fastlanes::BitPackedArray; use vortex_fastlanes::FoRArray; use vortex_scalar::Scalar; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")]; const REFERENCE_VALUE: u8 = 10; /// Creates a FoR array with the specified type and length. -fn make_for_array_typed(len: usize) -> FoRArray +fn make_for_array_typed(len: usize, bp: bool) -> FoRArray where T: NativePType + From + Add, Scalar: From, { let reference = >::from(REFERENCE_VALUE); let data: Vec = (0..len) - .map(|i| >::from((i % 256) as u8) + reference) + .map(|i| >::from((i % 256) as u8)) .collect(); let primitive_array = PrimitiveArray::new(Buffer::from(data), Validity::NonNullable).into_array(); - FoRArray::try_new(primitive_array, reference.into()).vortex_expect("failed to create FoR array") + if bp && T::PTYPE != PType::U8 { + let child = + BitPackedArray::encode(primitive_array.as_ref(), 8).vortex_expect("failed to bitpack"); + FoRArray::try_new(child.into_array(), reference.into()) + .vortex_expect("failed to create FoR array") + } else { + FoRArray::try_new(primitive_array, reference.into()) + .vortex_expect("failed to create FoR array") + } } -/// Launches FoR decompression kernel and returns elapsed GPU time. -fn launch_for_kernel_timed_typed( - for_array: &FoRArray, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult +/// Benchmark FoR decompression for a specific type. 
+fn benchmark_for_typed(c: &mut Criterion, type_name: &str) where - T: NativePType + DeviceRepr + From, + T: NativePType + DeviceRepr + From + Add, + Scalar: From, { - let encoded = for_array.encoded(); - let unpacked_array = encoded.to_primitive(); - let unpacked_slice = unpacked_array.as_slice::(); + let mut group = c.benchmark_group("for_cuda"); + group.sample_size(10); - let device_data = block_on(cuda_ctx.copy_to_device(unpacked_slice.to_vec()).unwrap()) - .vortex_expect("failed to copy to device"); + for &(len, len_str) in BENCH_ARGS { + group.throughput(Throughput::Bytes((len * size_of::()) as u64)); - let reference = >::from(REFERENCE_VALUE); - let array_len_u64 = for_array.len() as u64; - - let device_view = device_data - .cuda_view::() - .vortex_expect("failed to get device view"); - - let events = vortex_cuda::launch_cuda_kernel!( - execution_ctx: cuda_ctx, - module: "for", - ptypes: &[for_array.ptype()], - launch_args: [device_view, reference, array_len_u64], - event_recording: CU_EVENT_BLOCKING_SYNC, - array_len: for_array.len() - ); - - events.duration() + let for_array = make_for_array_typed::(len, false); + + group.bench_with_input( + BenchmarkId::new("for", format!("{len_str}_{type_name}")), + &for_array, + |b, for_array| { + b.iter_custom(|iters| { + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); + + for _ in 0..iters { + block_on(FoRExecutor.execute(for_array.to_array(), &mut cuda_ctx)).unwrap(); + } + + Duration::from_nanos(timer.load(Ordering::Relaxed)) + }); + }, + ); + } + + group.finish(); } -/// Benchmark FoR decompression for a specific type. -fn benchmark_for_typed(c: &mut Criterion, type_name: &str) +fn benchmark_ffor_typed(c: &mut Criterion, type_name: &str) where T: NativePType + DeviceRepr + From + Add, Scalar: From, { - let mut group = c.benchmark_group("for_cuda"); + let mut group = c.benchmark_group("ffor_cuda"); group.sample_size(10); - for (len, len_str) in BENCH_ARGS { + for &(len, len_str) in BENCH_ARGS { group.throughput(Throughput::Bytes((len * size_of::()) as u64)); - let for_array = make_for_array_typed::(*len); + let for_array = make_for_array_typed::(len, true); group.bench_with_input( BenchmarkId::new("for", format!("{len_str}_{type_name}")), &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); - let mut total_time = Duration::ZERO; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context") + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = - launch_for_kernel_timed_typed::(for_array, &mut cuda_ctx) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + block_on(FoRExecutor.execute(for_array.to_array(), &mut cuda_ctx)).unwrap(); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); @@ -134,7 +151,15 @@ fn benchmark_for(c: &mut Criterion) { benchmark_for_typed::(c, "u64"); } -criterion::criterion_group!(benches, benchmark_for); +/// Benchmark FOR+BP decompression for all types. 
+fn benchmark_ffor(c: &mut Criterion) { + benchmark_ffor_typed::(c, "u8"); + benchmark_ffor_typed::(c, "u16"); + benchmark_ffor_typed::(c, "u32"); + benchmark_ffor_typed::(c, "u64"); +} + +criterion::criterion_group!(benches, benchmark_for, benchmark_ffor); #[cuda_available] criterion::criterion_main!(benches); diff --git a/vortex-cuda/benches/runend_cuda.rs b/vortex-cuda/benches/runend_cuda.rs index cb7b1effbb5..0ce1a37e11b 100644 --- a/vortex-cuda/benches/runend_cuda.rs +++ b/vortex-cuda/benches/runend_cuda.rs @@ -6,31 +6,33 @@ #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] +mod common; + use std::mem::size_of; +use std::sync::Arc; +use std::sync::atomic::Ordering; use std::time::Duration; use criterion::BenchmarkId; use criterion::Criterion; use criterion::Throughput; use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; use futures::executor::block_on; use vortex_array::IntoArray; -use vortex_array::ToCanonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; -use vortex_cuda::CudaBufferExt; -use vortex_cuda::CudaExecutionCtx; use vortex_cuda::CudaSession; +use vortex_cuda::RunEndExecutor; +use vortex_cuda::executor::CudaExecute; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; -use vortex_dtype::PType; -use vortex_error::VortexExpect; use vortex_runend::RunEndArray; use vortex_session::VortexSession; +use crate::common::TimedLaunchStrategy; + /// Creates a run-end encoded array with the specified output length and average run length. fn make_runend_array_typed(output_len: usize, avg_run_len: usize) -> RunEndArray where @@ -56,64 +58,6 @@ where RunEndArray::new(ends_array, values_array) } -/// Launches runend decode kernel and returns elapsed GPU time. 
-fn launch_runend_kernel_timed_typed( - runend_array: &RunEndArray, - cuda_ctx: &mut CudaExecutionCtx, -) -> vortex_error::VortexResult -where - T: NativePType + DeviceRepr, -{ - let ends_prim = runend_array.ends().to_primitive(); - let values_prim = runend_array.values().to_primitive(); - - let output_len = runend_array.len(); - let num_runs = ends_prim.len(); - let offset = runend_array.offset(); - - let ends_device = block_on( - cuda_ctx - .copy_to_device(ends_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy ends to device"); - - let values_device = block_on( - cuda_ctx - .copy_to_device(values_prim.as_slice::().to_vec()) - .unwrap(), - ) - .vortex_expect("failed to copy values to device"); - - let output_device = block_on( - cuda_ctx - .copy_to_device(vec![T::default(); output_len]) - .unwrap(), - ) - .vortex_expect("failed to allocate output buffer"); - - let ends_view = ends_device - .cuda_view::() - .vortex_expect("failed to get ends view"); - let values_view = values_device - .cuda_view::() - .vortex_expect("failed to get values view"); - let output_view = output_device - .cuda_view::() - .vortex_expect("failed to get output view"); - - let events = vortex_cuda::launch_cuda_kernel!( - execution_ctx: cuda_ctx, - module: "runend", - ptypes: &[T::PTYPE, PType::U64], - launch_args: [ends_view, num_runs, values_view, offset, output_len, output_view], - event_recording: CU_EVENT_BLOCKING_SYNC, - array_len: output_len - ); - - events.duration() -} - /// Benchmark run-end decoding for a specific type with varying run lengths fn benchmark_runend_typed(c: &mut Criterion, type_name: &str) where @@ -137,20 +81,22 @@ where &runend_array, |b, runend_array| { b.iter_custom(|iters| { + let timed = TimedLaunchStrategy::default(); + let timer = Arc::clone(timed.get()); + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) - .vortex_expect("failed to create execution context"); - - let mut total_time = Duration::ZERO; + .unwrap() + .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - let kernel_time = - launch_runend_kernel_timed_typed::(runend_array, &mut cuda_ctx) - .vortex_expect("kernel launch failed"); - total_time += kernel_time; + block_on( + RunEndExecutor.execute(runend_array.to_array(), &mut cuda_ctx), + ) + .unwrap(); } - total_time + Duration::from_nanos(timer.load(Ordering::Relaxed)) }); }, ); diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index ff9bac6febd..95091051010 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -12,6 +12,7 @@ use cudarc::driver::CudaSlice; use cudarc::driver::CudaStream; use cudarc::driver::DeviceRepr; use cudarc::driver::LaunchArgs; +use cudarc::driver::LaunchConfig; use futures::future::BoxFuture; use vortex_array::Array; use vortex_array::ArrayRef; @@ -28,8 +29,14 @@ use vortex_error::vortex_err; use crate::CudaSession; use crate::ExportDeviceArray; +use crate::debug; +use crate::kernel::DefaultLaunchStrategy; +use crate::kernel::LaunchStrategy; +use crate::kernel::launch_cuda_kernel_impl; +use crate::kernel::launch_cuda_kernel_with_config; use crate::session::CudaSessionExt; use crate::stream::VortexCudaStream; +use crate::trace; /// CUDA kernel events recorded before and after kernel launch. 
#[derive(Debug)] @@ -57,6 +64,7 @@ pub struct CudaExecutionCtx { stream: VortexCudaStream, ctx: ExecutionCtx, cuda_session: CudaSession, + strategy: Arc, } impl CudaExecutionCtx { @@ -67,9 +75,68 @@ impl CudaExecutionCtx { stream, ctx, cuda_session, + strategy: Arc::new(DefaultLaunchStrategy), } } + /// Set the launch strategy for the execution context. + /// + /// This can only be set on setup (an "owned" context) and not from within + /// a kernel execution. + pub fn with_launch_strategy(mut self, launch_strategy: Arc) -> Self { + self.strategy = launch_strategy; + self + } + + /// Launch a Kernel function with args setup done by the provided `build_args` closure. + /// + /// Kernels launched this way will use the default launch configuration, which provides no + /// shared memory bytes, and uses grid parameters based on the ideal thread block size for + /// the given `len`. + pub fn launch_kernel<'a, F>( + &'a mut self, + function: &'a CudaFunction, + len: usize, + build_args: F, + ) -> VortexResult<()> + where + F: FnOnce(&mut LaunchArgs<'a>), + { + let mut launcher = self.launch_builder(function); + build_args(&mut launcher); + + let events = launch_cuda_kernel_impl(&mut launcher, self.strategy.event_flags(), len)?; + self.strategy.on_complete(&events, len)?; + + drop(events); + + Ok(()) + } + + /// Launch a function with args provided by the `build_args` closure, with an explicit + /// [`LaunchConfig`], for kernels which need specific grid and shared memory configuration. + pub fn launch_kernel_config<'a, F>( + &'a mut self, + function: &'a CudaFunction, + cfg: LaunchConfig, + len: usize, + build_args: F, + ) -> VortexResult<()> + where + F: FnOnce(&mut LaunchArgs<'a>), + { + let mut launcher = self.launch_builder(function); + build_args(&mut launcher); + + let events = + launch_cuda_kernel_with_config(&mut launcher, cfg, self.strategy.event_flags())?; + self.strategy.on_complete(&events, len)?; + + drop(events); + + Ok(()) + } + /// Loads a CUDA kernel function by module name and ptype(s). /// /// # Arguments @@ -223,18 +290,19 @@ impl CudaArrayExt for ArrayRef { } if self.is_canonical() || self.is_empty() { + trace!(encoding = ?self.encoding_id(), "skipping canonical"); return self.execute(&mut ctx.ctx); } let Some(support) = ctx.cuda_session.kernel(&self.encoding_id()) else { - tracing::debug!( + debug!( encoding = %self.encoding_id(), "No CUDA support registered for encoding, falling back to CPU execution" ); return self.execute(&mut ctx.ctx); }; - tracing::debug!( + debug!( encoding = %self.encoding_id(), "Executing array on CUDA device" ); diff --git a/vortex-cuda/src/kernel/arrays/constant.rs b/vortex-cuda/src/kernel/arrays/constant.rs index f38784e3af7..b9acb133d07 100644 --- a/vortex-cuda/src/kernel/arrays/constant.rs +++ b/vortex-cuda/src/kernel/arrays/constant.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::ConstantArray; @@ -31,13 +30,13 @@ use vortex_error::vortex_err; use crate::CudaDeviceBuffer; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA executor for constant arrays with numeric types. /// /// Materializes a constant array by filling a device buffer with the scalar value. /// Supports primitive types (integers, floats) and decimal types (i128, i256). 
#[derive(Debug)] +#[doc(hidden)] pub struct ConstantNumericExecutor; impl ConstantNumericExecutor { @@ -48,6 +47,10 @@ impl ConstantNumericExecutor { #[async_trait] impl CudaExecute for ConstantNumericExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -114,16 +117,12 @@ where // Load kernel function let kernel_ptypes = [P::PTYPE]; let cuda_function = ctx.load_function_ptype("constant_numeric", &kernel_ptypes)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - // Build launch args: output, value, length - launch_builder.arg(&output_view); - launch_builder.arg(&value); - launch_builder.arg(&array_len_u64); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&output_view); + args.arg(&value); + args.arg(&array_len_u64); + })?; // Wrap the CudaSlice in a CudaDeviceBuffer and then BufferHandle let device_buffer = CudaDeviceBuffer::new(output_buffer); @@ -174,16 +173,12 @@ where // Load kernel function let cuda_function = ctx.load_function("constant_numeric", &[&D::DECIMAL_TYPE.to_string()])?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - - // Build launch args: output, value, length - launch_builder.arg(&output_view); - launch_builder.arg(&value); - launch_builder.arg(&array_len_u64); - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&output_view); + args.arg(&value); + args.arg(&array_len_u64); + })?; // Wrap the CudaSlice in a CudaDeviceBuffer and then BufferHandle let device_buffer = CudaDeviceBuffer::new(output_buffer); diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index e58c91528aa..3cccb2dddfd 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -34,14 +34,18 @@ use crate::CudaDeviceBuffer; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA executor for dictionary-encoded arrays. 
#[derive(Debug)] +#[doc(hidden)] pub struct DictExecutor; #[async_trait] impl CudaExecute for DictExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -127,15 +131,14 @@ async fn execute_dict_prim_typed(); let codes_len_u64 = codes_len as u64; - // Launch the dict kernel - let _cuda_events = crate::launch_cuda_kernel!( - execution_ctx: ctx, - module: "dict", - ptypes: &[value_ptype, I::PTYPE], - launch_args: [codes_view, codes_len_u64, values_view, output_view], - event_recording: cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING, - array_len: codes_len - ); + + let kernel_function = ctx.load_function_ptype("dict", &[value_ptype, I::PTYPE])?; + ctx.launch_kernel(&kernel_function, codes_len, |args| { + args.arg(&codes_view) + .arg(&codes_len_u64) + .arg(&values_view) + .arg(&output_view); + })?; Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( BufferHandle::new_device(Arc::new(output_device)), @@ -186,6 +189,7 @@ async fn execute_dict_decimal_typed< ) -> VortexResult { assert!(!codes.is_empty()); let codes_len = codes.len(); + let codes_len_u64 = codes_len as u64; if codes_len == 0 { vortex_bail!("Cannot execute dict on empty codes array"); } @@ -230,18 +234,13 @@ async fn execute_dict_decimal_typed< "dict", &[&V::DECIMAL_TYPE.to_string(), &C::PTYPE.to_string()], )?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - launch_builder.arg(&codes_view); - launch_builder.arg(&codes_len); - launch_builder.arg(&values_view); - launch_builder.arg(&output_view); - - let _cuda_events = launch_cuda_kernel_impl( - &mut launch_builder, - cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING, - codes_len, - )?; + ctx.launch_kernel(&cuda_function, codes_len, |args| { + args.arg(&codes_view) + .arg(&codes_len_u64) + .arg(&values_view) + .arg(&output_view); + })?; Ok(Canonical::Decimal(DecimalArray::new_handle( BufferHandle::new_device(Arc::new(output_device)), @@ -311,19 +310,15 @@ async fn execute_dict_varbinview( let codes_ptype_str = C::PTYPE.to_string(); let cuda_function = ctx.load_function("dict", &["i128", &codes_ptype_str])?; - let mut launch_builder = ctx.launch_builder(&cuda_function); let codes_len_u64 = codes_len as u64; - launch_builder.arg(&codes_view); - launch_builder.arg(&codes_len_u64); - launch_builder.arg(&values_view); - launch_builder.arg(&output_view); - - let _cuda_events = launch_cuda_kernel_impl( - &mut launch_builder, - cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING, - codes_len, - )?; + + ctx.launch_kernel(&cuda_function, codes_len, |args| { + args.arg(&codes_view); + args.arg(&codes_len_u64); + args.arg(&values_view); + args.arg(&output_view); + })?; }); // Output views gathered by the kernel share the values' data buffers. diff --git a/vortex-cuda/src/kernel/arrays/shared.rs b/vortex-cuda/src/kernel/arrays/shared.rs index 8bd4e0e3f98..aba9f6ffb43 100644 --- a/vortex-cuda/src/kernel/arrays/shared.rs +++ b/vortex-cuda/src/kernel/arrays/shared.rs @@ -14,10 +14,15 @@ use crate::executor::CudaExecutionCtx; /// CUDA executor for SharedArray. 
#[derive(Debug)] +#[doc(hidden)] pub struct SharedExecutor; #[async_trait] impl CudaExecute for SharedExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs index de4deb61b4f..e54c437e3cf 100644 --- a/vortex-cuda/src/kernel/encodings/alp.rs +++ b/vortex-cuda/src/kernel/encodings/alp.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_alp::ALPArray; use vortex_alp::ALPFloat; use vortex_alp::ALPVTable; @@ -30,14 +29,18 @@ use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; use crate::kernel::patches::execute_patches; -use crate::launch_cuda_kernel_impl; /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression. #[derive(Debug)] +#[doc(hidden)] pub struct ALPExecutor; #[async_trait] impl CudaExecute for ALPExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -90,20 +93,14 @@ where // Load kernel function let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE]; let cuda_function = ctx.load_function_ptype("alp", &kernel_ptypes)?; - { - let mut launch_builder = ctx.launch_builder(&cuda_function); - - // Build launch args: input, output, f, e, length - launch_builder.arg(&input_view); - launch_builder.arg(&output_view); - launch_builder.arg(&f); - launch_builder.arg(&e); - launch_builder.arg(&array_len_u64); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; - } + + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&input_view) + .arg(&output_view) + .arg(&f) + .arg(&e) + .arg(&array_len_u64); + })?; // Check if there are any patches to decode here let output_buf = if let Some(patches) = array.patches() { diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index edac00aae2a..ff232b54ea8 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -9,7 +9,6 @@ use cudarc::driver::CudaFunction; use cudarc::driver::DeviceRepr; use cudarc::driver::LaunchConfig; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -32,11 +31,11 @@ use crate::CudaBufferExt; use crate::CudaDeviceBuffer; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::kernel::launch_cuda_kernel_with_config; use crate::kernel::patches::execute_patches; /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression. 
#[derive(Debug)] +#[doc(hidden)] pub struct BitPackedExecutor; impl BitPackedExecutor { @@ -47,6 +46,10 @@ impl BitPackedExecutor { #[async_trait] impl CudaExecute for BitPackedExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -87,7 +90,7 @@ pub fn bitpacked_cuda_launch_config(output_width: usize, len: usize) -> VortexRe }) } -async fn decode_bitpacked( +pub(crate) async fn decode_bitpacked( array: BitPackedArray, ctx: &mut CudaExecutionCtx, ) -> VortexResult @@ -123,18 +126,11 @@ where let output_width = size_of::() * 8; let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, ctx)?; + let config = bitpacked_cuda_launch_config(output_width, len)?; - { - let mut launch_builder = ctx.launch_builder(&cuda_function); - - launch_builder.arg(&input_view); - launch_builder.arg(&output_view); - - let config = bitpacked_cuda_launch_config(output_width, len)?; - - let _cuda_events = - launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?; - } + ctx.launch_kernel_config(&cuda_function, config, len, |args| { + args.arg(&input_view).arg(&output_view); + })?; let output_handle = match patches { None => BufferHandle::new_device(output_buf.slice_typed::(offset..(offset + len))), diff --git a/vortex-cuda/src/kernel/encodings/date_time_parts.rs b/vortex-cuda/src/kernel/encodings/date_time_parts.rs index 393779435fa..8932bd0e29c 100644 --- a/vortex-cuda/src/kernel/encodings/date_time_parts.rs +++ b/vortex-cuda/src/kernel/encodings/date_time_parts.rs @@ -6,7 +6,6 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; @@ -35,16 +34,20 @@ use crate::CudaDeviceBuffer; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA executor for DateTimeParts arrays. /// /// Combines the days, seconds, and subseconds components into a single i64 timestamp array. 
#[derive(Debug)] +#[doc(hidden)] pub struct DateTimePartsExecutor; #[async_trait] impl CudaExecute for DateTimePartsExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -196,18 +199,17 @@ where ]; let kernel_suffix_strs: Vec<&str> = kernel_suffixes.iter().map(|s| s.as_str()).collect(); let cuda_function = ctx.load_function("date_time_parts", &kernel_suffix_strs)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - launch_builder.arg(&days_view); - launch_builder.arg(&seconds_view); - launch_builder.arg(&subseconds_view); - launch_builder.arg(&divisor); - launch_builder.arg(&output_view); let array_len_u64 = output_len as u64; - launch_builder.arg(&array_len_u64); - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, output_len)?; + ctx.launch_kernel(&cuda_function, output_len, |args| { + args.arg(&days_view) + .arg(&seconds_view) + .arg(&subseconds_view) + .arg(&divisor) + .arg(&output_view) + .arg(&array_len_u64); + })?; let output_buffer = BufferHandle::new_device(Arc::new(output_device)); let output_primitive = PrimitiveArray::from_buffer_handle(output_buffer, PType::I64, validity); diff --git a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs index 57472a1eb5e..37ac1a3e2ed 100644 --- a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs +++ b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs @@ -20,10 +20,15 @@ use crate::executor::CudaExecute; // See `DecimalBytePartsArray` #[derive(Debug)] +#[doc(hidden)] pub struct DecimalBytePartsExecutor; #[async_trait] impl CudaExecute for DecimalBytePartsExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 4a585f52f6e..6b63f7801b8 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -6,7 +6,7 @@ use std::fmt::Debug; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -25,10 +25,10 @@ use crate::CudaBufferExt; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA decoder for frame-of-reference. 
#[derive(Debug)] +#[doc(hidden)] pub struct FoRExecutor; impl FoRExecutor { @@ -39,6 +39,10 @@ impl FoRExecutor { #[async_trait] impl CudaExecute for FoRExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -83,16 +87,10 @@ where // Load kernel function let kernel_ptypes = [P::PTYPE]; let cuda_function = ctx.load_function_ptype("for", &kernel_ptypes)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - // Build launch args: buffer, reference, length - launch_builder.arg(&cuda_view); - launch_builder.arg(&reference); - launch_builder.arg(&array_len_u64); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&cuda_view).arg(&reference).arg(&array_len_u64); + })?; // Build result - in-place reuses the same buffer Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( diff --git a/vortex-cuda/src/kernel/encodings/runend.rs b/vortex-cuda/src/kernel/encodings/runend.rs index dbd5d15f3b4..110e115474d 100644 --- a/vortex-cuda/src/kernel/encodings/runend.rs +++ b/vortex-cuda/src/kernel/encodings/runend.rs @@ -6,7 +6,6 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::ConstantArray; @@ -32,10 +31,10 @@ use crate::CudaDeviceBuffer; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA executor for run-end encoded arrays. 
#[derive(Debug)] +#[doc(hidden)] pub struct RunEndExecutor; impl RunEndExecutor { @@ -46,6 +45,10 @@ impl RunEndExecutor { #[async_trait] impl CudaExecute for RunEndExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -136,18 +139,15 @@ async fn decode_runend_typed = kernel_ptypes.iter().map(|s| s.as_str()).collect(); let cuda_function = ctx.load_function("runend", &kernel_ptype_strs)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - - launch_builder.arg(&ends_view); - launch_builder.arg(&num_runs); - launch_builder.arg(&values_view); - launch_builder.arg(&offset); - launch_builder.arg(&output_len); - launch_builder.arg(&output_view); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, output_len)?; + + ctx.launch_kernel(&cuda_function, output_len, |args| { + args.arg(&ends_view) + .arg(&num_runs) + .arg(&values_view) + .arg(&offset) + .arg(&output_len) + .arg(&output_view); + })?; let output_validity = match values_validity { Validity::NonNullable => Validity::NonNullable, diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs index daf08bb9654..0a7de984f47 100644 --- a/vortex-cuda/src/kernel/encodings/sequence.rs +++ b/vortex-cuda/src/kernel/encodings/sequence.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use cudarc::driver::PushKernelArg; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -22,7 +22,6 @@ use vortex_sequence::SequenceVTable; use crate::CudaDeviceBuffer; use crate::CudaExecutionCtx; use crate::executor::CudaExecute; -use crate::launch_cuda_kernel; /// CUDA execution for `SequenceArray`. #[derive(Debug)] @@ -30,6 +29,10 @@ pub struct SequenceExecutor; #[async_trait] impl CudaExecute for SequenceExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -67,14 +70,11 @@ async fn execute_typed( let len_u64 = len as u64; - let _events = launch_cuda_kernel!( - execution_ctx: ctx, - module: "sequence", - ptypes: &[T::PTYPE], - launch_args: [buffer, base, multiplier, len_u64], - event_recording: CU_EVENT_DISABLE_TIMING, - array_len: len - ); + let kernel_func = ctx.load_function_ptype("sequence", &[T::PTYPE])?; + + ctx.launch_kernel(&kernel_func, len, |args| { + args.arg(&buffer).arg(&base).arg(&multiplier).arg(&len_u64); + })?; let output_buf = BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(buffer))); diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs index 1e4c97263b9..c57c7701206 100644 --- a/vortex-cuda/src/kernel/encodings/zigzag.rs +++ b/vortex-cuda/src/kernel/encodings/zigzag.rs @@ -6,7 +6,6 @@ use std::fmt::Debug; use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -25,7 +24,6 @@ use crate::CudaBufferExt; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::launch_cuda_kernel_impl; /// CUDA decoder for ZigZag decoding. 
#[derive(Debug)] @@ -39,6 +37,10 @@ impl ZigZagExecutor { #[async_trait] impl CudaExecute for ZigZagExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -86,17 +88,11 @@ where let array_len_u64 = array_len as u64; // Load kernel function - let kernel_ptypes = [U::PTYPE]; - let cuda_function = ctx.load_function_ptype("zigzag", &kernel_ptypes)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); + let cuda_function = ctx.load_function_ptype("zigzag", &[U::PTYPE])?; - // Build launch args: buffer, length - launch_builder.arg(&cuda_view); - launch_builder.arg(&array_len_u64); - - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + ctx.launch_kernel(&cuda_function, array_len, |args| { + args.arg(&cuda_view).arg(&array_len_u64); + })?; // Build result - in-place, reinterpret as signed Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs index d4b68937047..5db6e955ddd 100644 --- a/vortex-cuda/src/kernel/encodings/zstd.rs +++ b/vortex-cuda/src/kernel/encodings/zstd.rs @@ -35,6 +35,7 @@ use vortex_zstd::ZstdVTable; use crate::CudaBufferExt; use crate::CudaDeviceBuffer; +use crate::debug; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; @@ -196,6 +197,10 @@ impl ZstdExecutor { #[async_trait] impl CudaExecute for ZstdExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, @@ -205,9 +210,9 @@ impl CudaExecute for ZstdExecutor { match zstd.as_ref().dtype() { DType::Binary(_) | DType::Utf8(_) => decode_zstd(zstd, ctx).await, - other => { - tracing::debug!( - dtype = %other, + _other => { + debug!( + dtype = %_other, "Only Binary/Utf8 ZSTD arrays supported on GPU, falling back to CPU" ); zstd.decompress()?.to_canonical() @@ -250,6 +255,49 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu let stream = ctx.stream(); + // NOTE(aduffy): we need to use the explicit tracing/not(tracing) blocks here because we go + // through nvcomp instead of delegating through the LaunchBuilder. + // We should find a way to bridge the two. + #[cfg(feature = "tracing")] + { + let before = stream + .record_event(Some(cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT)) + .map_err(|e| vortex_err!("recording event: {e}"))?; + unsafe { + nvcomp_zstd::decompress_async( + exec.frame_ptrs_ptr as _, + exec.frame_sizes_ptr as _, + exec.output_sizes_ptr as _, + exec.device_actual_sizes.device_ptr_mut(stream).0 as _, + exec.num_frames, + exec.nvcomp_temp_buffer.device_ptr_mut(stream).0 as _, + exec.nvcomp_temp_buffer_size, + exec.output_ptrs_ptr as _, + exec.device_statuses.device_ptr_mut(stream).0 as _, + stream.cu_stream().cast(), + ) + .map_err(|e| vortex_err!("nvcomp decompress_async failed: {}", e))?; + } + + let after = stream + .record_event(Some(cudarc::driver::sys::CUevent_flags::CU_EVENT_DEFAULT)) + .map_err(|e| vortex_err!("recording event: {e}"))?; + + // measure timing. 
note: this forces a sync + let duration = crate::CudaKernelEvents { + before_launch: before, + after_launch: after, + } + .duration()?; + + crate::trace!( + execution_nanos = duration.as_nanos(), + len = n_rows, + "ZSTD execution" + ); + } + + #[cfg(not(feature = "tracing"))] unsafe { nvcomp_zstd::decompress_async( exec.frame_ptrs_ptr as _, diff --git a/vortex-cuda/src/kernel/filter/mod.rs b/vortex-cuda/src/kernel/filter/mod.rs index 23e12daa167..a0567ebba97 100644 --- a/vortex-cuda/src/kernel/filter/mod.rs +++ b/vortex-cuda/src/kernel/filter/mod.rs @@ -42,6 +42,10 @@ pub struct FilterExecutor; #[async_trait] impl CudaExecute for FilterExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 9dca93a0e94..b247e8367f3 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -34,49 +34,54 @@ pub use filter::FilterExecutor; pub use slice::SliceExecutor; use crate::CudaKernelEvents; +#[cfg(feature = "tracing")] +use crate::trace; -/// Convenience macro to launch a CUDA kernel. +/// Trait for customizing kernel launch behavior. /// -/// The kernel gets launched on the stream of the execution context. -/// -/// The kernel launch config: -/// LaunchConfig { -/// grid_dim: (array.len() / 2048, 1, 1), -/// block_dim: (64, 1, 1), -/// shared_mem_bytes: 0, -/// }; -/// 64 threads are used per block which corresponds to 2 warps. -/// Each block handles 2048 elements. Each thread handles 32 elements. -/// The last block and thread are allowed to have less elements. -/// -/// Note: A macro is necessary to unroll the launch builder arguments. -/// -/// # Returns -/// -/// A pair of CUDA events submitted before and after the kernel. -/// Depending on `CUevent_flags` these events can contain timestamps. Use -/// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to -/// enable timestamps. -#[macro_export] -macro_rules! launch_cuda_kernel { - ( - execution_ctx: $ctx:expr, - module: $module:expr, - ptypes: $ptypes:expr, - launch_args: [$($arg:expr),* $(,)?], - event_recording: $event_recording:expr, - array_len: $len:expr - ) => {{ - use ::cudarc::driver::PushKernelArg as _; - let cuda_function = $ctx.load_function_ptype($module, $ptypes)?; - let mut launch_builder = $ctx.launch_builder(&cuda_function); - - $( - launch_builder.arg(&$arg); - )* - - $crate::launch_cuda_kernel_impl(&mut launch_builder, $event_recording, $len)? - }}; +/// Implementations can add tracing, async callbacks, or other behavior +/// around kernel launches. +pub trait LaunchStrategy: Debug + Send + Sync + 'static { + /// Returns the event flags to use for this launch. + fn event_flags(&self) -> CUevent_flags; + + /// Called after the kernel launch completes with the recorded events. + fn on_complete(&self, events: &CudaKernelEvents, len: usize) -> VortexResult<()>; +} + +/// Default launch strategy with no tracing overhead. +#[derive(Debug)] +pub struct DefaultLaunchStrategy; + +impl LaunchStrategy for DefaultLaunchStrategy { + fn event_flags(&self) -> CUevent_flags { + CUevent_flags::CU_EVENT_DISABLE_TIMING + } + + fn on_complete(&self, _events: &CudaKernelEvents, _len: usize) -> VortexResult<()> { + Ok(()) + } +} + +/// Launch strategy that records timing and emits trace events. 
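+/// +/// A minimal sketch of opting into this strategy (assuming an execution context created via `CudaSession::create_execution_ctx`, as the CUDA scan tool added in this patch does): +/// +/// ```ignore +/// let mut cuda_ctx = CudaSession::create_execution_ctx(&session)? +/// .with_launch_strategy(Arc::new(TracingLaunchStrategy)); +/// ```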
+#[cfg(feature = "tracing")] +#[derive(Debug)] +pub struct TracingLaunchStrategy; + +#[cfg(feature = "tracing")] +impl LaunchStrategy for TracingLaunchStrategy { + fn event_flags(&self) -> CUevent_flags { + CUevent_flags::CU_EVENT_DEFAULT + } + + fn on_complete(&self, events: &CudaKernelEvents, len: usize) -> VortexResult<()> { + let duration = events.duration()?; + trace!( + execution_nanos = duration.as_nanos(), + len, "execution completed" + ); + Ok(()) + } } /// Launches a CUDA kernel with the passed launch builder. @@ -92,7 +97,7 @@ macro_rules! launch_cuda_kernel { /// Depending on `CUevent_flags` these events can contain timestamps. Use /// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to /// enable timestamps. -pub fn launch_cuda_kernel_impl( +pub(crate) fn launch_cuda_kernel_impl( launch_builder: &mut LaunchArgs, event_flags: CUevent_flags, array_len: usize, @@ -127,7 +132,7 @@ pub fn launch_cuda_kernel_impl( /// Depending on `CUevent_flags` these events can contain timestamps. Use /// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to /// enable timestamps. -pub fn launch_cuda_kernel_with_config( +pub(crate) fn launch_cuda_kernel_with_config( launch_builder: &mut LaunchArgs, config: LaunchConfig, event_flags: CUevent_flags, @@ -153,7 +158,7 @@ pub fn launch_cuda_kernel_with_config( /// /// Handles loading PTX files, compiling modules, and loading functions. #[derive(Debug)] -pub struct KernelLoader { +pub(crate) struct KernelLoader { /// Cache of loaded CUDA modules, keyed by module name modules: DashMap>, } diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index 9e326a6367e..3761cd1ba1e 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -2,7 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use cudarc::driver::DeviceRepr; -use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use cudarc::driver::PushKernelArg; use vortex_array::arrays::PrimitiveArrayParts; use vortex_array::patches::Patches; use vortex_array::validity::Validity; @@ -16,7 +16,6 @@ use crate::CudaBufferExt; use crate::CudaDeviceBuffer; use crate::CudaExecutionCtx; use crate::executor::CudaArrayExt; -use crate::launch_cuda_kernel; /// Apply a set of patches in-place onto a [`CudaDeviceBuffer`] holding `ValuesT`. 
pub(crate) async fn execute_patches< @@ -86,20 +85,14 @@ pub(crate) async fn execute_patches< let d_patch_indices_view = d_patch_indices.cuda_view::()?; let d_patch_values_view = d_patch_values.cuda_view::()?; - // kernel arg order for patches is values, patchIndices, patchValues, patchesLen - let _events = launch_cuda_kernel!( - execution_ctx: ctx, - module: "patches", - ptypes: &[ValuesT::PTYPE, IndicesT::PTYPE], - launch_args: [ - d_target_view, - d_patch_indices_view, - d_patch_values_view, - patches_len_u64, - ], - event_recording: CU_EVENT_DISABLE_TIMING, - array_len: patches_len - ); + let kernel_func = ctx.load_function_ptype("patches", &[ValuesT::PTYPE, IndicesT::PTYPE])?; + + ctx.launch_kernel(&kernel_func, patches_len, |args| { + args.arg(&d_target_view) + .arg(&d_patch_indices_view) + .arg(&d_patch_values_view) + .arg(&patches_len_u64); + })?; Ok(target) } diff --git a/vortex-cuda/src/kernel/slice/mod.rs b/vortex-cuda/src/kernel/slice/mod.rs index 4b19dfc746e..13922bba805 100644 --- a/vortex-cuda/src/kernel/slice/mod.rs +++ b/vortex-cuda/src/kernel/slice/mod.rs @@ -19,6 +19,10 @@ pub struct SliceExecutor; #[async_trait] impl CudaExecute for SliceExecutor { + #[cfg_attr( + feature = "tracing", + tracing::instrument(level = "trace", skip_all, fields(self)) + )] async fn execute( &self, array: ArrayRef, diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index d3a3e6d0162..f19489aa3de 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -12,6 +12,7 @@ pub mod dynamic_dispatch; pub mod executor; mod host_to_device_allocator; mod kernel; +mod macros; mod session; mod stream; mod stream_pool; @@ -23,25 +24,27 @@ pub use device_buffer::CudaDeviceBuffer; pub use executor::CudaExecutionCtx; pub use executor::CudaKernelEvents; pub use host_to_device_allocator::CopyDeviceReadAt; -use kernel::ALPExecutor; -use kernel::BitPackedExecutor; -use kernel::ConstantNumericExecutor; -use kernel::DateTimePartsExecutor; -use kernel::DecimalBytePartsExecutor; -use kernel::DictExecutor; -use kernel::FilterExecutor; -use kernel::FoRExecutor; -use kernel::RunEndExecutor; -use kernel::SharedExecutor; -use kernel::ZigZagExecutor; +pub use kernel::ALPExecutor; +pub use kernel::BitPackedExecutor; +pub use kernel::ConstantNumericExecutor; +pub use kernel::DateTimePartsExecutor; +pub use kernel::DecimalBytePartsExecutor; +pub use kernel::DefaultLaunchStrategy; +pub use kernel::DictExecutor; +pub use kernel::FilterExecutor; +pub use kernel::FoRExecutor; +pub use kernel::LaunchStrategy; +pub use kernel::RunEndExecutor; +pub use kernel::SharedExecutor; +#[cfg(feature = "tracing")] +pub use kernel::TracingLaunchStrategy; +pub use kernel::ZigZagExecutor; #[cfg(feature = "unstable_encodings")] use kernel::ZstdBuffersExecutor; use kernel::ZstdExecutor; pub use kernel::ZstdKernelPrep; pub use kernel::bitpacked_cuda_kernel; pub use kernel::bitpacked_cuda_launch_config; -pub use kernel::launch_cuda_kernel_impl; -pub use kernel::launch_cuda_kernel_with_config; pub use kernel::zstd_kernel_prepare; pub use session::CudaSession; pub use session::CudaSessionExt; @@ -64,8 +67,8 @@ use vortex_zigzag::ZigZagVTable; use vortex_zstd::ZstdBuffersVTable; use vortex_zstd::ZstdVTable; -use crate::kernel::SequenceExecutor; -use crate::kernel::SliceExecutor; +pub use crate::kernel::SequenceExecutor; +pub use crate::kernel::SliceExecutor; /// Checks if CUDA is available on the system by looking for nvcc. 
pub fn cuda_available() -> bool { @@ -77,7 +80,7 @@ pub fn cuda_available() -> bool { /// Registers CUDA kernels. pub fn initialize_cuda(session: &CudaSession) { - tracing::info!("Registering CUDA kernels"); + info!("Registering CUDA kernels"); session.register_kernel(ALPVTable::ID, &ALPExecutor); session.register_kernel(BitPackedVTable::ID, &BitPackedExecutor); session.register_kernel(ConstantVTable::ID, &ConstantNumericExecutor); diff --git a/vortex-cuda/src/macros.rs b/vortex-cuda/src/macros.rs new file mode 100644 index 00000000000..e537995cd3e --- /dev/null +++ b/vortex-cuda/src/macros.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#[macro_export] +macro_rules! warn { + ($($tts:tt)*) => { + #[cfg(feature = "tracing")] + { + tracing::warn!($($tts)*); + } + }; +} + +#[macro_export] +macro_rules! info { + ($($tts:tt)*) => { + #[cfg(feature = "tracing")] + { + tracing::info!($($tts)*); + } + }; +} + +#[macro_export] +macro_rules! debug { + ($($tts:tt)*) => { + #[cfg(feature = "tracing")] + { + tracing::debug!($($tts)*); + } + }; +} + +#[macro_export] +macro_rules! trace { + ($($tts:tt)*) => { + #[cfg(feature = "tracing")] + { + tracing::trace!($($tts)*); + } + }; +} diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index 33233582116..14e42078f38 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -16,6 +16,7 @@ use crate::ExportDeviceArray; use crate::arrow::CanonicalDeviceArrayExport; use crate::executor::CudaExecute; pub use crate::executor::CudaExecutionCtx; +use crate::initialize_cuda; use crate::kernel::KernelLoader; use crate::stream::VortexCudaStream; use crate::stream_pool::VortexCudaStreamPool; @@ -128,7 +129,7 @@ impl CudaSession { } impl Default for CudaSession { - /// Creates a default CUDA session using device 0. + /// Creates a default CUDA session using device 0, with all GPU array kernels registered. /// /// # Panics /// @@ -136,7 +137,9 @@ impl Default for CudaSession { fn default() -> Self { #[expect(clippy::expect_used)] let context = CudaContext::new(0).expect("Failed to initialize CUDA device 0"); - Self::new(context) + let this = Self::new(context); + initialize_cuda(&this); + this } } diff --git a/vortex-cuda/src/stream.rs b/vortex-cuda/src/stream.rs index fad54b36ab4..449cd6db072 100644 --- a/vortex-cuda/src/stream.rs +++ b/vortex-cuda/src/stream.rs @@ -19,6 +19,7 @@ use vortex_error::VortexResult; use vortex_error::vortex_err; use crate::CudaDeviceBuffer; +use crate::warn; #[derive(Clone)] pub struct VortexCudaStream(pub Arc); @@ -155,10 +156,11 @@ fn register_stream_callback(stream: &CudaStream) -> VortexResult) }; // Blocking send as we're in a callback invoked by the CUDA driver. - #[expect(clippy::expect_used)] - tx.send(()) - // A send should never fail. Panic otherwise. - .expect("CUDA callback receiver dropped unexpectedly"); + // NOTE: send can fail if the CudaEvent is dropped by the caller, in which case the receiver + // is closed and sends will fail.
+ if let Err(_e) = tx.send(()) { + warn!(error = ?_e, "register_stream_callback send failed due to error"); + } } // SAFETY: diff --git a/vortex-python/src/arrow.rs b/vortex-python/src/arrow.rs index ece662ee80e..861602f08c9 100644 --- a/vortex-python/src/arrow.rs +++ b/vortex-python/src/arrow.rs @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2016-2025 Copyright The Apache Software Foundation // SPDX-FileCopyrightText: 2025 Copyright the Vortex contributors // SPDX-License-Identifier: Apache-2.0 -// SPDX-FileComment: Derived from upstream file arrow-pyarrow/src/lib.rs at commit 549709fb at https://github.com/apache/arrow-rs +// SPDX-FileComment: Derived from upstream file arrow-pyarrow/src/main at commit 549709fb at https://github.com/apache/arrow-rs // SPDX-FileNotice: https://github.com/apache/arrow-rs/blob/549709fbdf91cd1f6c263a7e4540c542b6fecf6b/NOTICE.txt #![allow(clippy::same_name_method)] diff --git a/vortex-test/e2e-cuda-scan/Cargo.toml b/vortex-test/e2e-cuda-scan/Cargo.toml new file mode 100644 index 00000000000..2e7d53e2f75 --- /dev/null +++ b/vortex-test/e2e-cuda-scan/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "vortex-test-e2e-cuda-scan" +authors = { workspace = true } +description = "CUDA scan testing" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +publish = false +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +futures = { workspace = true, features = ["executor"] } +tokio = { workspace = true, features = ["macros", "full"] } +tracing = { workspace = true, features = ["std", "attributes"] } +tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } +vortex = { workspace = true } +vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"] } diff --git a/vortex-test/e2e-cuda-scan/src/main.rs b/vortex-test/e2e-cuda-scan/src/main.rs new file mode 100644 index 00000000000..b82af917193 --- /dev/null +++ b/vortex-test/e2e-cuda-scan/src/main.rs @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::env::args; +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; + +use futures::StreamExt; +use tracing::Instrument; +use tracing_subscriber::EnvFilter; +use tracing_subscriber::fmt::format::FmtSpan; +use vortex::VortexSessionDefault; +use vortex::array::ToCanonical; +use vortex::array::arrays::DictVTable; +use vortex::buffer::ByteBuffer; +use vortex::buffer::ByteBufferMut; +use vortex::compressor::BtrBlocksCompressorBuilder; +use vortex::compressor::FloatCode; +use vortex::compressor::IntCode; +use vortex::compressor::StringCode; +use vortex::error::VortexResult; +use vortex::file::Footer; +use vortex::file::OpenOptionsSessionExt; +use vortex::file::WriteOptionsSessionExt; +use vortex::file::WriteStrategyBuilder; +use vortex::session::VortexSession; +use vortex_cuda::CopyDeviceReadAt; +use vortex_cuda::CudaSession; +use vortex_cuda::TracingLaunchStrategy; +use vortex_cuda::VortexCudaStreamPool; +use vortex_cuda::executor::CudaArrayExt; + +#[tokio::main] +pub async fn main() -> VortexResult<()> { + let args: Vec = args().collect(); + let json_output = args.iter().any(|arg| arg == "--json"); + + if json_output { + tracing_subscriber::fmt() + .json() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) 
+ .init(); + } else { + tracing_subscriber::fmt() + .pretty() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .event_format(tracing_subscriber::fmt::format().with_target(true)) + .init(); + } + + let session = VortexSession::default(); + let mut cuda_ctx = CudaSession::create_execution_ctx(&session)? + .with_launch_strategy(Arc::new(TracingLaunchStrategy)); + + #[allow(clippy::expect_used, clippy::unwrap_in_result)] + let input_path = args + .iter() + .skip(1) + .find(|arg| !arg.starts_with("--")) + .expect("must provide path to .vortex file"); + let input_path = PathBuf::from(input_path); + + assert!(input_path.exists(), "input path does not exist"); + + let (recompressed, footer) = recompress_for_gpu(input_path, &session).await?; + + // Create a full scan that executes on the GPU + let cuda_stream = + VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; + let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); + + let gpu_file = session + .open_options() + .with_footer(footer) + .open(Arc::new(gpu_reader)) + .await?; + + // Kernel execution times are reported as tracing events during the scan below. + let mut batches = gpu_file.scan()?.into_array_stream()?; + + let mut chunk = 0; + while let Some(next) = batches.next().await.transpose()? { + let record = next.to_struct(); + + for (field, field_name) in record + .unmasked_fields() + .iter() + .zip(record.struct_fields().names().iter()) + { + let field_name = field_name.to_string(); + // Skip dictionary columns: dictionaries over varbin values aren't properly implemented yet. + if field.is::() { + continue; + } + + let span = + tracing::info_span!("array execution", chunk = chunk, field_name = field_name); + + async { + if field.clone().execute_cuda(&mut cuda_ctx).await.is_err() { + tracing::error!("failed to execute_cuda on column"); + } + } + .instrument(span) + .await; + } + + chunk += 1; + } + + Ok(()) +} + +// TODO: dump the recompressed values out as a new Vortex file for analysis. + +/// Recompress the input file using only GPU-executable encodings, returning the file as an +/// in-memory byte array. +async fn recompress_for_gpu( + input_path: impl AsRef, + session: &VortexSession, +) -> VortexResult<(ByteBuffer, Footer)> { + // Set up the reader + let input = session.open_options().open_path(input_path).await?; + + // Build a scan to read all columns from the input, and recompress them using only GPU-compatible + // encodings. + let scan = input.scan()?.into_array_stream()?; + + // Rebuild a copy of the file that only uses GPU-compatible compression algorithms. + let compressor = BtrBlocksCompressorBuilder::empty() + .include_int([ + IntCode::Uncompressed, + IntCode::Constant, + IntCode::BitPacking, + IntCode::For, + IntCode::Sequence, + IntCode::ZigZag, + IntCode::Dict, + ]) + .include_float([ + FloatCode::Uncompressed, + FloatCode::Constant, + FloatCode::Alp, + FloatCode::AlpRd, + FloatCode::RunEnd, + ]) + // Restrict string columns to encodings that the GPU executors are able to decode, or leave + // them uncompressed. + .include_string([ + StringCode::Uncompressed, + StringCode::Constant, + StringCode::Dict, + StringCode::Zstd, + StringCode::ZstdBuffers, + ]) + .build(); + + // Build the write strategy around the GPU-compatible compressor. + let writer = WriteStrategyBuilder::default() + .with_compressor(compressor) + .build(); + + // Segment sink?
+ let mut out = ByteBufferMut::empty(); + let result = session + .write_options() + .with_strategy(writer) + .write(&mut out, scan) + .await?; + + Ok((out.freeze(), result.footer().clone())) +} diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index 23d12726e87..c989d634cb2 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -33,6 +33,9 @@ pub mod compute2 { pub mod compressor { pub use vortex_btrblocks::BtrBlocksCompressor; pub use vortex_btrblocks::BtrBlocksCompressorBuilder; + pub use vortex_btrblocks::FloatCode; + pub use vortex_btrblocks::IntCode; + pub use vortex_btrblocks::StringCode; } pub mod dtype { From 16ad0e64f541fcb468e4d18948b10309d4fa3f71 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 09:59:57 -0500 Subject: [PATCH 02/11] no pub *Executor Signed-off-by: Andrew Duffy --- vortex-cuda/benches/bitpacked_cuda.rs | 5 ++-- vortex-cuda/benches/date_time_parts_cuda.rs | 8 ++---- vortex-cuda/benches/dict_cuda.rs | 5 ++-- vortex-cuda/benches/for_cuda.rs | 7 ++--- vortex-cuda/benches/runend_cuda.rs | 8 ++---- vortex-cuda/src/kernel/arrays/constant.rs | 3 +- vortex-cuda/src/kernel/arrays/dict.rs | 3 +- vortex-cuda/src/kernel/arrays/mod.rs | 6 ++-- vortex-cuda/src/kernel/arrays/shared.rs | 3 +- vortex-cuda/src/kernel/encodings/alp.rs | 3 +- vortex-cuda/src/kernel/encodings/bitpacked.rs | 3 +- .../src/kernel/encodings/date_time_parts.rs | 3 +- .../kernel/encodings/decimal_byte_parts.rs | 3 +- vortex-cuda/src/kernel/encodings/for_.rs | 3 +- vortex-cuda/src/kernel/encodings/mod.rs | 22 +++++++-------- vortex-cuda/src/kernel/encodings/runend.rs | 3 +- vortex-cuda/src/kernel/encodings/sequence.rs | 2 +- vortex-cuda/src/kernel/encodings/zigzag.rs | 2 +- vortex-cuda/src/kernel/encodings/zstd.rs | 2 +- .../src/kernel/encodings/zstd_buffers.rs | 2 +- vortex-cuda/src/kernel/mod.rs | 14 ++++++---- vortex-cuda/src/lib.rs | 28 +++++++++---------- 22 files changed, 58 insertions(+), 80 deletions(-) diff --git a/vortex-cuda/benches/bitpacked_cuda.rs b/vortex-cuda/benches/bitpacked_cuda.rs index 0ef0e7f03f0..fb859607e12 100644 --- a/vortex-cuda/benches/bitpacked_cuda.rs +++ b/vortex-cuda/benches/bitpacked_cuda.rs @@ -22,9 +22,8 @@ use futures::executor::block_on; use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity::NonNullable; use vortex_buffer::Buffer; -use vortex_cuda::BitPackedExecutor; use vortex_cuda::CudaSession; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -83,7 +82,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on(BitPackedExecutor.execute(array.to_array(), &mut cuda_ctx)).unwrap(); + block_on(array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) diff --git a/vortex-cuda/benches/date_time_parts_cuda.rs b/vortex-cuda/benches/date_time_parts_cuda.rs index 4a142974082..f378630cd8b 100644 --- a/vortex-cuda/benches/date_time_parts_cuda.rs +++ b/vortex-cuda/benches/date_time_parts_cuda.rs @@ -23,8 +23,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; -use vortex_cuda::DateTimePartsExecutor; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_datetime_parts::DateTimePartsArray; @@ -77,10 +76,7 
@@ fn benchmark_datetimeparts(c: &mut Criterion) { for _ in 0..iters { // block on immediately here - block_on( - DateTimePartsExecutor.execute(dtp_array.to_array(), &mut cuda_ctx), - ) - .unwrap(); + block_on(dtp_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index 5c1ae658b38..5ef61bc7d6c 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -24,8 +24,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity::NonNullable; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; -use vortex_cuda::DictExecutor; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -101,7 +100,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on(DictExecutor.execute(dict_array.to_array(), &mut cuda_ctx)) + block_on(dict_array.to_array().execute_cuda(&mut cuda_ctx)) .vortex_expect("execute"); } diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index 56b50486750..ce45b3d6041 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -24,8 +24,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; -use vortex_cuda::FoRExecutor; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -93,7 +92,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on(FoRExecutor.execute(for_array.to_array(), &mut cuda_ctx)).unwrap(); + block_on(for_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) @@ -131,7 +130,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on(FoRExecutor.execute(for_array.to_array(), &mut cuda_ctx)).unwrap(); + block_on(for_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) diff --git a/vortex-cuda/benches/runend_cuda.rs b/vortex-cuda/benches/runend_cuda.rs index 0ce1a37e11b..7e5f4e5906d 100644 --- a/vortex-cuda/benches/runend_cuda.rs +++ b/vortex-cuda/benches/runend_cuda.rs @@ -23,8 +23,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_cuda::CudaSession; -use vortex_cuda::RunEndExecutor; -use vortex_cuda::executor::CudaExecute; +use vortex_cuda::executor::CudaArrayExt; use vortex_cuda_macros::cuda_available; use vortex_cuda_macros::cuda_not_available; use vortex_dtype::NativePType; @@ -90,10 +89,7 @@ where .with_launch_strategy(Arc::new(timed)); for _ in 0..iters { - block_on( - RunEndExecutor.execute(runend_array.to_array(), &mut cuda_ctx), - ) - .unwrap(); + block_on(runend_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap(); } Duration::from_nanos(timer.load(Ordering::Relaxed)) diff --git a/vortex-cuda/src/kernel/arrays/constant.rs b/vortex-cuda/src/kernel/arrays/constant.rs index b9acb133d07..8f0451d31e6 100644 --- a/vortex-cuda/src/kernel/arrays/constant.rs +++ b/vortex-cuda/src/kernel/arrays/constant.rs @@ -36,8 +36,7 @@ use crate::executor::CudaExecutionCtx; /// Materializes a constant array by 
filling a device buffer with the scalar value. /// Supports primitive types (integers, floats) and decimal types (i128, i256). #[derive(Debug)] -#[doc(hidden)] -pub struct ConstantNumericExecutor; +pub(crate) struct ConstantNumericExecutor; impl ConstantNumericExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index 3cccb2dddfd..620270816f1 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -37,8 +37,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA executor for dictionary-encoded arrays. #[derive(Debug)] -#[doc(hidden)] -pub struct DictExecutor; +pub(crate) struct DictExecutor; #[async_trait] impl CudaExecute for DictExecutor { diff --git a/vortex-cuda/src/kernel/arrays/mod.rs b/vortex-cuda/src/kernel/arrays/mod.rs index dc3a9a80fba..ab81934bb27 100644 --- a/vortex-cuda/src/kernel/arrays/mod.rs +++ b/vortex-cuda/src/kernel/arrays/mod.rs @@ -5,6 +5,6 @@ mod constant; mod dict; mod shared; -pub use constant::ConstantNumericExecutor; -pub use dict::DictExecutor; -pub use shared::SharedExecutor; +pub(crate) use constant::ConstantNumericExecutor; +pub(crate) use dict::DictExecutor; +pub(crate) use shared::SharedExecutor; diff --git a/vortex-cuda/src/kernel/arrays/shared.rs b/vortex-cuda/src/kernel/arrays/shared.rs index aba9f6ffb43..cae0dc68988 100644 --- a/vortex-cuda/src/kernel/arrays/shared.rs +++ b/vortex-cuda/src/kernel/arrays/shared.rs @@ -14,8 +14,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA executor for SharedArray. #[derive(Debug)] -#[doc(hidden)] -pub struct SharedExecutor; +pub(crate) struct SharedExecutor; #[async_trait] impl CudaExecute for SharedExecutor { diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs index e54c437e3cf..5dc9d8c6d0d 100644 --- a/vortex-cuda/src/kernel/encodings/alp.rs +++ b/vortex-cuda/src/kernel/encodings/alp.rs @@ -32,8 +32,7 @@ use crate::kernel::patches::execute_patches; /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression. #[derive(Debug)] -#[doc(hidden)] -pub struct ALPExecutor; +pub(crate) struct ALPExecutor; #[async_trait] impl CudaExecute for ALPExecutor { diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index ff232b54ea8..a4455ab536c 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -35,8 +35,7 @@ use crate::kernel::patches::execute_patches; /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression. #[derive(Debug)] -#[doc(hidden)] -pub struct BitPackedExecutor; +pub(crate) struct BitPackedExecutor; impl BitPackedExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/date_time_parts.rs b/vortex-cuda/src/kernel/encodings/date_time_parts.rs index 8932bd0e29c..0adde430bee 100644 --- a/vortex-cuda/src/kernel/encodings/date_time_parts.rs +++ b/vortex-cuda/src/kernel/encodings/date_time_parts.rs @@ -39,8 +39,7 @@ use crate::executor::CudaExecutionCtx; /// /// Combines the days, seconds, and subseconds components into a single i64 timestamp array. 
#[derive(Debug)] -#[doc(hidden)] -pub struct DateTimePartsExecutor; +pub(crate) struct DateTimePartsExecutor; #[async_trait] impl CudaExecute for DateTimePartsExecutor { diff --git a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs index 37ac1a3e2ed..042de8802d2 100644 --- a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs +++ b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs @@ -20,8 +20,7 @@ use crate::executor::CudaExecute; // See `DecimalBytePartsArray` #[derive(Debug)] -#[doc(hidden)] -pub struct DecimalBytePartsExecutor; +pub(crate) struct DecimalBytePartsExecutor; #[async_trait] impl CudaExecute for DecimalBytePartsExecutor { diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 6b63f7801b8..6ec3bc5772b 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -28,8 +28,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA decoder for frame-of-reference. #[derive(Debug)] -#[doc(hidden)] -pub struct FoRExecutor; +pub(crate) struct FoRExecutor; impl FoRExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs index b26ca50e1cc..62a8d9f606d 100644 --- a/vortex-cuda/src/kernel/encodings/mod.rs +++ b/vortex-cuda/src/kernel/encodings/mod.rs @@ -13,18 +13,16 @@ mod zstd; #[cfg(feature = "unstable_encodings")] mod zstd_buffers; -pub use alp::ALPExecutor; -pub use bitpacked::BitPackedExecutor; -pub use bitpacked::bitpacked_cuda_kernel; -pub use bitpacked::bitpacked_cuda_launch_config; -pub use date_time_parts::DateTimePartsExecutor; -pub use decimal_byte_parts::DecimalBytePartsExecutor; -pub use for_::FoRExecutor; -pub use runend::RunEndExecutor; -pub use sequence::SequenceExecutor; -pub use zigzag::ZigZagExecutor; -pub use zstd::ZstdExecutor; +pub(crate) use alp::ALPExecutor; +pub(crate) use bitpacked::BitPackedExecutor; +pub(crate) use date_time_parts::DateTimePartsExecutor; +pub(crate) use decimal_byte_parts::DecimalBytePartsExecutor; +pub(crate) use for_::FoRExecutor; +pub(crate) use runend::RunEndExecutor; +pub(crate) use sequence::SequenceExecutor; +pub(crate) use zigzag::ZigZagExecutor; +pub(crate) use zstd::ZstdExecutor; pub use zstd::ZstdKernelPrep; pub use zstd::zstd_kernel_prepare; #[cfg(feature = "unstable_encodings")] -pub use zstd_buffers::ZstdBuffersExecutor; +pub(crate) use zstd_buffers::ZstdBuffersExecutor; diff --git a/vortex-cuda/src/kernel/encodings/runend.rs b/vortex-cuda/src/kernel/encodings/runend.rs index 110e115474d..a2d35d869e1 100644 --- a/vortex-cuda/src/kernel/encodings/runend.rs +++ b/vortex-cuda/src/kernel/encodings/runend.rs @@ -34,8 +34,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA executor for run-end encoded arrays. #[derive(Debug)] -#[doc(hidden)] -pub struct RunEndExecutor; +pub(crate) struct RunEndExecutor; impl RunEndExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs index 0a7de984f47..354e26b0629 100644 --- a/vortex-cuda/src/kernel/encodings/sequence.rs +++ b/vortex-cuda/src/kernel/encodings/sequence.rs @@ -25,7 +25,7 @@ use crate::executor::CudaExecute; /// CUDA execution for `SequenceArray`. 
#[derive(Debug)] -pub struct SequenceExecutor; +pub(crate) struct SequenceExecutor; #[async_trait] impl CudaExecute for SequenceExecutor { diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs index c57c7701206..c42c34d0624 100644 --- a/vortex-cuda/src/kernel/encodings/zigzag.rs +++ b/vortex-cuda/src/kernel/encodings/zigzag.rs @@ -27,7 +27,7 @@ use crate::executor::CudaExecutionCtx; /// CUDA decoder for ZigZag decoding. #[derive(Debug)] -pub struct ZigZagExecutor; +pub(crate) struct ZigZagExecutor; impl ZigZagExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs index 5db6e955ddd..c9980415c62 100644 --- a/vortex-cuda/src/kernel/encodings/zstd.rs +++ b/vortex-cuda/src/kernel/encodings/zstd.rs @@ -187,7 +187,7 @@ pub async fn zstd_kernel_prepare( /// CUDA executor for ZSTD decompression using nvCOMP. #[derive(Debug)] -pub struct ZstdExecutor; +pub(crate) struct ZstdExecutor; impl ZstdExecutor { fn try_specialize(array: ArrayRef) -> Option { diff --git a/vortex-cuda/src/kernel/encodings/zstd_buffers.rs b/vortex-cuda/src/kernel/encodings/zstd_buffers.rs index 56207741cbf..509ed41f121 100644 --- a/vortex-cuda/src/kernel/encodings/zstd_buffers.rs +++ b/vortex-cuda/src/kernel/encodings/zstd_buffers.rs @@ -34,7 +34,7 @@ use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; #[derive(Debug)] -pub struct ZstdBuffersExecutor; +pub(crate) struct ZstdBuffersExecutor; #[async_trait] impl CudaExecute for ZstdBuffersExecutor { diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index b247e8367f3..0bca57a15e1 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -26,12 +26,14 @@ mod filter; mod patches; mod slice; -pub use arrays::ConstantNumericExecutor; -pub use arrays::DictExecutor; -pub use arrays::SharedExecutor; -pub use encodings::*; -pub use filter::FilterExecutor; -pub use slice::SliceExecutor; +pub(crate) use arrays::ConstantNumericExecutor; +pub(crate) use arrays::DictExecutor; +pub(crate) use arrays::SharedExecutor; +pub use encodings::ZstdKernelPrep; +pub use encodings::zstd_kernel_prepare; +pub(crate) use encodings::*; +pub(crate) use filter::FilterExecutor; +pub(crate) use slice::SliceExecutor; use crate::CudaKernelEvents; #[cfg(feature = "tracing")] diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index f19489aa3de..cf529f144ec 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -24,27 +24,25 @@ pub use device_buffer::CudaDeviceBuffer; pub use executor::CudaExecutionCtx; pub use executor::CudaKernelEvents; pub use host_to_device_allocator::CopyDeviceReadAt; -pub use kernel::ALPExecutor; -pub use kernel::BitPackedExecutor; -pub use kernel::ConstantNumericExecutor; -pub use kernel::DateTimePartsExecutor; -pub use kernel::DecimalBytePartsExecutor; +use kernel::ALPExecutor; +use kernel::BitPackedExecutor; +use kernel::ConstantNumericExecutor; +use kernel::DateTimePartsExecutor; +use kernel::DecimalBytePartsExecutor; pub use kernel::DefaultLaunchStrategy; -pub use kernel::DictExecutor; -pub use kernel::FilterExecutor; -pub use kernel::FoRExecutor; +use kernel::DictExecutor; +use kernel::FilterExecutor; +use kernel::FoRExecutor; pub use kernel::LaunchStrategy; -pub use kernel::RunEndExecutor; -pub use kernel::SharedExecutor; +use kernel::RunEndExecutor; +use kernel::SharedExecutor; #[cfg(feature = "tracing")] pub use kernel::TracingLaunchStrategy; -pub use 
kernel::ZigZagExecutor; +use kernel::ZigZagExecutor; #[cfg(feature = "unstable_encodings")] use kernel::ZstdBuffersExecutor; use kernel::ZstdExecutor; pub use kernel::ZstdKernelPrep; -pub use kernel::bitpacked_cuda_kernel; -pub use kernel::bitpacked_cuda_launch_config; pub use kernel::zstd_kernel_prepare; pub use session::CudaSession; pub use session::CudaSessionExt; @@ -67,8 +65,8 @@ use vortex_zigzag::ZigZagVTable; use vortex_zstd::ZstdBuffersVTable; use vortex_zstd::ZstdVTable; -pub use crate::kernel::SequenceExecutor; -pub use crate::kernel::SliceExecutor; +use crate::kernel::SequenceExecutor; +use crate::kernel::SliceExecutor; /// Checks if CUDA is available on the system by looking for nvcc. pub fn cuda_available() -> bool { From 5b9521f0d90a305b3a652d42f55e49e399a16eb1 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 10:04:23 -0500 Subject: [PATCH 03/11] move crate Signed-off-by: Andrew Duffy --- Cargo.lock | 24 +++++++++---------- Cargo.toml | 2 +- .../gpu-scan-cli}/Cargo.toml | 2 +- vortex-cuda/gpu-scan-cli/README.md | 19 +++++++++++++++ .../gpu-scan-cli}/src/main.rs | 0 5 files changed, 33 insertions(+), 14 deletions(-) rename {vortex-test/e2e-cuda-scan => vortex-cuda/gpu-scan-cli}/Cargo.toml (95%) create mode 100644 vortex-cuda/gpu-scan-cli/README.md rename {vortex-test/e2e-cuda-scan => vortex-cuda/gpu-scan-cli}/src/main.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 5034b68ef4e..e78dc7794f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4085,6 +4085,18 @@ dependencies = [ "yansi", ] +[[package]] +name = "gpu-scan-cli" +version = "0.1.0" +dependencies = [ + "futures", + "tokio", + "tracing", + "tracing-subscriber", + "vortex", + "vortex-cuda", +] + [[package]] name = "grid" version = "1.0.0" @@ -10811,18 +10823,6 @@ dependencies = [ "vortex-cuda", ] -[[package]] -name = "vortex-test-e2e-cuda-scan" -version = "0.1.0" -dependencies = [ - "futures", - "tokio", - "tracing", - "tracing-subscriber", - "vortex", - "vortex-cuda", -] - [[package]] name = "vortex-tui" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 349af16c976..73a3cefcc28 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ members = [ "vortex-duckdb", "vortex-cuda", "vortex-cuda/cub", + "vortex-cuda/gpu-scan-cli", "vortex-cuda/macros", "vortex-cuda/nvcomp", "vortex-cxx", @@ -36,7 +37,6 @@ members = [ "vortex-tui", "vortex-test/e2e", "vortex-test/e2e-cuda", - "vortex-test/e2e-cuda-scan", "xtask", # Encodings "encodings/fastlanes", diff --git a/vortex-test/e2e-cuda-scan/Cargo.toml b/vortex-cuda/gpu-scan-cli/Cargo.toml similarity index 95% rename from vortex-test/e2e-cuda-scan/Cargo.toml rename to vortex-cuda/gpu-scan-cli/Cargo.toml index 2e7d53e2f75..1ed2dd3e1a4 100644 --- a/vortex-test/e2e-cuda-scan/Cargo.toml +++ b/vortex-cuda/gpu-scan-cli/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "vortex-test-e2e-cuda-scan" +name = "gpu-scan-cli" authors = { workspace = true } description = "CUDA scan testing" edition = { workspace = true } diff --git a/vortex-cuda/gpu-scan-cli/README.md b/vortex-cuda/gpu-scan-cli/README.md new file mode 100644 index 00000000000..90210dc3ed8 --- /dev/null +++ b/vortex-cuda/gpu-scan-cli/README.md @@ -0,0 +1,19 @@ +# gpu-scan-cli + +A CLI tool for benchmarking CUDA-accelerated scans of Vortex files. + +## What it does + +1. Reads a Vortex file from disk +2. Recompresses it using only GPU-compatible encodings +3. Executes a full scan on the GPU via CUDA +4. 
Outputs tracing information about kernel execution times + +## Usage + +```bash +FLAT_LAYOUT_INLINE_ARRAY_NODE=true RUST_LOG=vortex_cuda=trace,info \ + cargo run --release --bin gpu-scan-cli -- ./path/to/file.vortex +``` + +Use `--json` for JSON-formatted trace output. diff --git a/vortex-test/e2e-cuda-scan/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs similarity index 100% rename from vortex-test/e2e-cuda-scan/src/main.rs rename to vortex-cuda/gpu-scan-cli/src/main.rs From 0d78b9e1d198f3cd353ef73f7e21929c24be7495 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 10:09:24 -0500 Subject: [PATCH 04/11] lockfiles Signed-off-by: Andrew Duffy --- vortex-btrblocks/public-api.lock | 2 ++ vortex/public-api.lock | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 30aa578343e..bc5af615759 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -222,6 +222,8 @@ impl vortex_btrblocks::BtrBlocksCompressorBuilder pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::build(self) -> vortex_btrblocks::BtrBlocksCompressor +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::empty() -> Self + pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_float(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_int(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self diff --git a/vortex/public-api.lock b/vortex/public-api.lock index 6a7a137a0be..cba20f93640 100644 --- a/vortex/public-api.lock +++ b/vortex/public-api.lock @@ -18,6 +18,12 @@ pub use vortex::compressor::BtrBlocksCompressor pub use vortex::compressor::BtrBlocksCompressorBuilder +pub use vortex::compressor::FloatCode + +pub use vortex::compressor::IntCode + +pub use vortex::compressor::StringCode + pub mod vortex::compute2 pub use vortex::compute2::<> From b2832604d19bfcdd3d115056123b67c91b496827 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 11:49:37 -0500 Subject: [PATCH 05/11] remove unused dep Signed-off-by: Andrew Duffy --- Cargo.lock | 1 - vortex-cuda/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e78dc7794f2..8bf2f0c234f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10113,7 +10113,6 @@ dependencies = [ "arrow-schema", "async-trait", "bindgen", - "bytes", "codspeed-criterion-compat-walltime", "cudarc", "fastlanes", diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index 197890664d5..b6559fdc085 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -27,7 +27,6 @@ arc-swap = { workspace = true } arrow-data = { workspace = true, features = ["ffi"] } arrow-schema = { workspace = true, features = ["ffi"] } async-trait = { workspace = true } -bytes = { workspace = true } cudarc = { workspace = true, features = ["f16"] } fastlanes = { workspace = true } futures = { workspace = true, features = ["executor"] } From 4b1ef7f3fe79cb1c8e2a960983ff7faf88f6aa2b Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sun, 15 Feb 2026 14:37:57 -0500 Subject: [PATCH 06/11] don't build CLI on windows Signed-off-by: Andrew Duffy --- vortex-cuda/gpu-scan-cli/Cargo.toml | 5 ++++- vortex-cuda/gpu-scan-cli/src/main.rs | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/vortex-cuda/gpu-scan-cli/Cargo.toml b/vortex-cuda/gpu-scan-cli/Cargo.toml index 1ed2dd3e1a4..feb1349f22a 100644 --- a/vortex-cuda/gpu-scan-cli/Cargo.toml +++ 
b/vortex-cuda/gpu-scan-cli/Cargo.toml @@ -15,10 +15,13 @@ version = { workspace = true } [lints] workspace = true +[features] +cuda = ["dep:vortex-cuda"] + [dependencies] futures = { workspace = true, features = ["executor"] } tokio = { workspace = true, features = ["macros", "full"] } tracing = { workspace = true, features = ["std", "attributes"] } tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } vortex = { workspace = true } -vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"] } +vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"], optional = true } diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs index b82af917193..8cbec8b8819 100644 --- a/vortex-cuda/gpu-scan-cli/src/main.rs +++ b/vortex-cuda/gpu-scan-cli/src/main.rs @@ -1,9 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +#![allow(unused_imports)] + use std::env::args; use std::path::Path; use std::path::PathBuf; +use std::process::exit; use std::sync::Arc; use futures::StreamExt; @@ -25,12 +28,24 @@ use vortex::file::OpenOptionsSessionExt; use vortex::file::WriteOptionsSessionExt; use vortex::file::WriteStrategyBuilder; use vortex::session::VortexSession; +#[cfg(feature = "cuda")] use vortex_cuda::CopyDeviceReadAt; +#[cfg(feature = "cuda")] use vortex_cuda::CudaSession; +#[cfg(feature = "cuda")] use vortex_cuda::TracingLaunchStrategy; +#[cfg(feature = "cuda")] use vortex_cuda::VortexCudaStreamPool; +#[cfg(feature = "cuda")] use vortex_cuda::executor::CudaArrayExt; +#[cfg(not(feature = "cuda"))] +pub fn main() { + eprintln!("this CLI requires being built with the `cuda` feature enabled"); + exit(1); +} + +#[cfg(feature = "cuda")] #[tokio::main] pub async fn main() -> VortexResult<()> { let args: Vec = args().collect(); @@ -120,6 +135,7 @@ pub async fn main() -> VortexResult<()> { /// Recompress the input file using only GPU-executable encodings, returning the file as an /// in-memory byte array. 
+#[cfg(feature = "cuda")] async fn recompress_for_gpu( input_path: impl AsRef, session: &VortexSession, From 5dfe050235450da49611c18d846a9de669440ad9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sun, 15 Feb 2026 14:52:23 -0500 Subject: [PATCH 07/11] lint Signed-off-by: Andrew Duffy --- vortex-cuda/gpu-scan-cli/Cargo.toml | 5 ++++- vortex-cuda/gpu-scan-cli/src/main.rs | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vortex-cuda/gpu-scan-cli/Cargo.toml b/vortex-cuda/gpu-scan-cli/Cargo.toml index feb1349f22a..44f5db7aa0e 100644 --- a/vortex-cuda/gpu-scan-cli/Cargo.toml +++ b/vortex-cuda/gpu-scan-cli/Cargo.toml @@ -24,4 +24,7 @@ tokio = { workspace = true, features = ["macros", "full"] } tracing = { workspace = true, features = ["std", "attributes"] } tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } vortex = { workspace = true } -vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"], optional = true } +vortex-cuda = { workspace = true, features = [ + "_test-harness", + "tracing", +], optional = true } diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs index 8cbec8b8819..f878d87eb72 100644 --- a/vortex-cuda/gpu-scan-cli/src/main.rs +++ b/vortex-cuda/gpu-scan-cli/src/main.rs @@ -6,7 +6,6 @@ use std::env::args; use std::path::Path; use std::path::PathBuf; -use std::process::exit; use std::sync::Arc; use futures::StreamExt; @@ -40,14 +39,15 @@ use vortex_cuda::VortexCudaStreamPool; use vortex_cuda::executor::CudaArrayExt; #[cfg(not(feature = "cuda"))] -pub fn main() { +#[allow(clippy::exit)] +fn main() { eprintln!("this CLI requires being built with the `cuda` feature enabled"); - exit(1); + std::process::exit(1); } #[cfg(feature = "cuda")] #[tokio::main] -pub async fn main() -> VortexResult<()> { +async fn main() -> VortexResult<()> { let args: Vec = args().collect(); let json_output = args.iter().any(|arg| arg == "--json"); From 9b8320f18d60e68efa07a765435f6f1dd52eaf1a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sun, 15 Feb 2026 15:10:11 -0500 Subject: [PATCH 08/11] skip Signed-off-by: Andrew Duffy --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a57060fd010..73131f0771e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -615,7 +615,7 @@ jobs: --exclude vortex-cub --exclude vortex-test-e2e-cuda --exclude duckdb-bench ` --exclude lance-bench --exclude datafusion-bench --exclude random-access-bench ` --exclude compress-bench --exclude xtask --exclude vortex-datafusion ` - --exclude vortex-sqllogictest + --exclude gpu-scan-cli --exclude vortex-sqllogictest - name: Rust Tests (Other) if: matrix.os != 'windows-x64' run: | From 2fcc7b7b97d1f38acc6e5d8e269f293e6ebcef04 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 9 Feb 2026 16:16:50 -0500 Subject: [PATCH 09/11] measure scans Signed-off-by: Andrew Duffy fixup Signed-off-by: Andrew Duffy --- Cargo.lock | 13 ++ Cargo.toml | 1 + vortex-cuda/Cargo.toml | 1 + vortex-test/e2e-cuda-scan/Cargo.toml | 24 ++++ vortex-test/e2e-cuda-scan/src/main.rs | 177 ++++++++++++++++++++++++++ 5 files changed, 216 insertions(+) create mode 100644 vortex-test/e2e-cuda-scan/Cargo.toml create mode 100644 vortex-test/e2e-cuda-scan/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 8bf2f0c234f..4699d403ecb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10113,6 +10113,7 @@ dependencies = [ "arrow-schema", "async-trait", 
"bindgen", + "bytes", "codspeed-criterion-compat-walltime", "cudarc", "fastlanes", @@ -10822,6 +10823,18 @@ dependencies = [ "vortex-cuda", ] +[[package]] +name = "vortex-test-e2e-cuda-scan" +version = "0.1.0" +dependencies = [ + "futures", + "tokio", + "tracing", + "tracing-subscriber", + "vortex", + "vortex-cuda", +] + [[package]] name = "vortex-tui" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 73a3cefcc28..ef0f4a2d58d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ members = [ "vortex-tui", "vortex-test/e2e", "vortex-test/e2e-cuda", + "vortex-test/e2e-cuda-scan", "xtask", # Encodings "encodings/fastlanes", diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index b6559fdc085..197890664d5 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -27,6 +27,7 @@ arc-swap = { workspace = true } arrow-data = { workspace = true, features = ["ffi"] } arrow-schema = { workspace = true, features = ["ffi"] } async-trait = { workspace = true } +bytes = { workspace = true } cudarc = { workspace = true, features = ["f16"] } fastlanes = { workspace = true } futures = { workspace = true, features = ["executor"] } diff --git a/vortex-test/e2e-cuda-scan/Cargo.toml b/vortex-test/e2e-cuda-scan/Cargo.toml new file mode 100644 index 00000000000..2e7d53e2f75 --- /dev/null +++ b/vortex-test/e2e-cuda-scan/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "vortex-test-e2e-cuda-scan" +authors = { workspace = true } +description = "CUDA scan testing" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +publish = false +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +futures = { workspace = true, features = ["executor"] } +tokio = { workspace = true, features = ["macros", "full"] } +tracing = { workspace = true, features = ["std", "attributes"] } +tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } +vortex = { workspace = true } +vortex-cuda = { workspace = true, features = ["_test-harness", "tracing"] } diff --git a/vortex-test/e2e-cuda-scan/src/main.rs b/vortex-test/e2e-cuda-scan/src/main.rs new file mode 100644 index 00000000000..b82af917193 --- /dev/null +++ b/vortex-test/e2e-cuda-scan/src/main.rs @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::env::args; +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; + +use futures::StreamExt; +use tracing::Instrument; +use tracing_subscriber::EnvFilter; +use tracing_subscriber::fmt::format::FmtSpan; +use vortex::VortexSessionDefault; +use vortex::array::ToCanonical; +use vortex::array::arrays::DictVTable; +use vortex::buffer::ByteBuffer; +use vortex::buffer::ByteBufferMut; +use vortex::compressor::BtrBlocksCompressorBuilder; +use vortex::compressor::FloatCode; +use vortex::compressor::IntCode; +use vortex::compressor::StringCode; +use vortex::error::VortexResult; +use vortex::file::Footer; +use vortex::file::OpenOptionsSessionExt; +use vortex::file::WriteOptionsSessionExt; +use vortex::file::WriteStrategyBuilder; +use vortex::session::VortexSession; +use vortex_cuda::CopyDeviceReadAt; +use vortex_cuda::CudaSession; +use vortex_cuda::TracingLaunchStrategy; +use vortex_cuda::VortexCudaStreamPool; +use vortex_cuda::executor::CudaArrayExt; + +#[tokio::main] +pub async fn main() -> VortexResult<()> { 
+ let args: Vec = args().collect(); + let json_output = args.iter().any(|arg| arg == "--json"); + + if json_output { + tracing_subscriber::fmt() + .json() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .init(); + } else { + tracing_subscriber::fmt() + .pretty() + .with_env_filter(EnvFilter::from_default_env()) + .with_span_events(FmtSpan::NONE) + .with_ansi(false) + .event_format(tracing_subscriber::fmt::format().with_target(true)) + .init(); + } + + let session = VortexSession::default(); + let mut cuda_ctx = CudaSession::create_execution_ctx(&session)? + .with_launch_strategy(Arc::new(TracingLaunchStrategy)); + + #[allow(clippy::expect_used, clippy::unwrap_in_result)] + let input_path = args + .iter() + .skip(1) + .find(|arg| !arg.starts_with("--")) + .expect("must provide path to .vortex file"); + let input_path = PathBuf::from(input_path); + + assert!(input_path.exists(), "input path does not exist"); + + let (recompressed, footer) = recompress_for_gpu(input_path, &session).await?; + + // Create a full scan that executes on the GPU + let cuda_stream = + VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; + let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); + + let gpu_file = session + .open_options() + .with_footer(footer) + .open(Arc::new(gpu_reader)) + .await?; + + // Kernel execution times are reported as tracing events during the scan below. + let mut batches = gpu_file.scan()?.into_array_stream()?; + + let mut chunk = 0; + while let Some(next) = batches.next().await.transpose()? { + let record = next.to_struct(); + + for (field, field_name) in record + .unmasked_fields() + .iter() + .zip(record.struct_fields().names().iter()) + { + let field_name = field_name.to_string(); + // Skip dictionary columns: dictionaries over varbin values aren't properly implemented yet. + if field.is::() { + continue; + } + + let span = + tracing::info_span!("array execution", chunk = chunk, field_name = field_name); + + async { + if field.clone().execute_cuda(&mut cuda_ctx).await.is_err() { + tracing::error!("failed to execute_cuda on column"); + } + } + .instrument(span) + .await; + } + + chunk += 1; + } + + Ok(()) +} + +// TODO: dump the recompressed values out as a new Vortex file for analysis. + +/// Recompress the input file using only GPU-executable encodings, returning the file as an +/// in-memory byte array. +async fn recompress_for_gpu( + input_path: impl AsRef, + session: &VortexSession, +) -> VortexResult<(ByteBuffer, Footer)> { + // Set up the reader + let input = session.open_options().open_path(input_path).await?; + + // Build a scan to read all columns from the input, and recompress them using only GPU-compatible + // encodings. + let scan = input.scan()?.into_array_stream()?; + + // Rebuild a copy of the file that only uses GPU-compatible compression algorithms. + let compressor = BtrBlocksCompressorBuilder::empty() + .include_int([ + IntCode::Uncompressed, + IntCode::Constant, + IntCode::BitPacking, + IntCode::For, + IntCode::Sequence, + IntCode::ZigZag, + IntCode::Dict, + ]) + .include_float([ + FloatCode::Uncompressed, + FloatCode::Constant, + FloatCode::Alp, + FloatCode::AlpRd, + FloatCode::RunEnd, + ]) + // Restrict string columns to encodings that the GPU executors are able to decode, or leave + // them uncompressed. + .include_string([ + StringCode::Uncompressed, + StringCode::Constant, + StringCode::Dict, + StringCode::Zstd, + StringCode::ZstdBuffers, + ]) + .build(); + + // Build the write strategy around the GPU-compatible compressor.
+ let writer = WriteStrategyBuilder::default() + .with_compressor(compressor) + .build(); + + // Segment sink? + let mut out = ByteBufferMut::empty(); + let result = session + .write_options() + .with_strategy(writer) + .write(&mut out, scan) + .await?; + + Ok((out.freeze(), result.footer().clone())) +} From 2435ef107fd99da7aba4ccaaf2680d172eef14ab Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 12 Feb 2026 16:12:10 -0500 Subject: [PATCH 10/11] fused FOR + BP Signed-off-by: Andrew Duffy --- vortex-cuda/cuda_kernel_generator/mod.rs | 20 +- vortex-cuda/kernels/src/bit_unpack_16.cu | 749 +- vortex-cuda/kernels/src/bit_unpack_32.cu | 2447 +++-- vortex-cuda/kernels/src/bit_unpack_64.cu | 8975 ++++++++--------- vortex-cuda/kernels/src/bit_unpack_8.cu | 289 +- vortex-cuda/src/kernel/encodings/bitpacked.rs | 5 +- vortex-cuda/src/kernel/encodings/for_.rs | 30 +- 7 files changed, 6268 insertions(+), 6247 deletions(-) diff --git a/vortex-cuda/cuda_kernel_generator/mod.rs b/vortex-cuda/cuda_kernel_generator/mod.rs index 19672737cd7..563071b196b 100644 --- a/vortex-cuda/cuda_kernel_generator/mod.rs +++ b/vortex-cuda/cuda_kernel_generator/mod.rs @@ -20,23 +20,22 @@ fn generate_lane_decoder( writeln!( output, - "__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{" + "__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, const uint{bits}_t reference, unsigned int lane) {{" )?; output.indent(|output| { writeln!(output, "unsigned int LANE_COUNT = {lanes};")?; if bit_width == 0 { - writeln!(output, "uint{bits}_t zero = 0ULL;")?; writeln!(output)?; for row in 0..bits { - writeln!(output, "out[INDEX({row}, lane)] = zero;")?; + writeln!(output, "out[INDEX({row}, lane)] = reference;")?; } } else if bit_width == bits { writeln!(output)?; for row in 0..bits { writeln!( output, - "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];", + "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane] + reference;", )?; } } else { @@ -72,7 +71,7 @@ fn generate_lane_decoder( )?; } - writeln!(output, "out[INDEX({row}, lane)] = tmp;")?; + writeln!(output, "out[INDEX({row}, lane)] = tmp + reference;")?; } } Ok(()) @@ -129,7 +128,7 @@ fn generate_device_kernel_for_width( let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t"); let local_func_params = format!( - "(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)" + "(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, const uint{bits}_t reference, int thread_idx)" ); writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?; @@ -138,7 +137,7 @@ fn generate_device_kernel_for_width( writeln!(output, "__shared__ uint{bits}_t shared_out[1024];")?; for thread_lane in 0..per_thread_loop_count { - writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, shared_out, thread_idx * {per_thread_loop_count} + {thread_lane});")?; + writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, shared_out, reference, thread_idx * {per_thread_loop_count} + {thread_lane});")?; } writeln!(output, "for (int i = 0; i < {shared_copy_ncount}; i++) {{")?; @@ -160,8 +159,9 @@ fn generate_global_kernel_for_width( let bits = ::T; let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t"); - let func_params = - format!("(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out)"); + let func_params = format!( + "(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict 
full_out, const uint{bits}_t reference)" + ); writeln!( output, @@ -176,7 +176,7 @@ fn generate_global_kernel_for_width( )?; writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?; - writeln!(output, "_{func_name}(in, out, thread_idx);") + writeln!(output, "_{func_name}(in, out, reference, thread_idx);") })?; writeln!(output, "}}") diff --git a/vortex-cuda/kernels/src/bit_unpack_16.cu b/vortex-cuda/kernels/src/bit_unpack_16.cu index f4867a03294..3ce273bd0a5 100644 --- a/vortex-cuda/kernels/src/bit_unpack_16.cu +++ b/vortex-cuda/kernels/src/bit_unpack_16.cu @@ -4,857 +4,856 @@ #include #include "fastlanes_common.cuh" -__device__ void _bit_unpack_16_0bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_0bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; - uint16_t zero = 0ULL; - out[INDEX(0, lane)] = zero; - out[INDEX(1, lane)] = zero; - out[INDEX(2, lane)] = zero; - out[INDEX(3, lane)] = zero; - out[INDEX(4, lane)] = zero; - out[INDEX(5, lane)] = zero; - out[INDEX(6, lane)] = zero; - out[INDEX(7, lane)] = zero; - out[INDEX(8, lane)] = zero; - out[INDEX(9, lane)] = zero; - out[INDEX(10, lane)] = zero; - out[INDEX(11, lane)] = zero; - out[INDEX(12, lane)] = zero; - out[INDEX(13, lane)] = zero; - out[INDEX(14, lane)] = zero; - out[INDEX(15, lane)] = zero; + out[INDEX(0, lane)] = reference; + out[INDEX(1, lane)] = reference; + out[INDEX(2, lane)] = reference; + out[INDEX(3, lane)] = reference; + out[INDEX(4, lane)] = reference; + out[INDEX(5, lane)] = reference; + out[INDEX(6, lane)] = reference; + out[INDEX(7, lane)] = reference; + out[INDEX(8, lane)] = reference; + out[INDEX(9, lane)] = reference; + out[INDEX(10, lane)] = reference; + out[INDEX(11, lane)] = reference; + out[INDEX(12, lane)] = reference; + out[INDEX(13, lane)] = reference; + out[INDEX(14, lane)] = reference; + out[INDEX(15, lane)] = reference; } -__device__ void _bit_unpack_16_1bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_1bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 1); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 1); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 1); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 1); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 1); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 1); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 1); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 1); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 1); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 1); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 1); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 11) & 
MASK(uint16_t, 1); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 1); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 1); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 1); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_2bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_2bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 2); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 2); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 2); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 2); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 2); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 2); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 2); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 0)) << 2; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 2); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 2); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 2); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 2); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 2); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 2); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 2); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_3bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_3bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 3); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 3); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 3); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 3); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 3); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] 
= tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 2)) << 1; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 3); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 3); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 3); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 3); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 1)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 3); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 3); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 3); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 3); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_4bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_4bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 4); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 4); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 4); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 0)) << 4; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 4); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 4); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 4); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 0)) << 4; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 4); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 4); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 4); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 0)) << 4; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 4); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 4); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 4); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + 
reference; tmp = (src >> 12) & MASK(uint16_t, 4); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_5bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_5bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 5); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 5); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 5); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 4)) << 1; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 5); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 5); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 3)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 5); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 5); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 2)) << 3; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 5); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 5); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 1)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 5); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 5); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_6bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_6bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 6); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 6); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 2)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 6); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 6); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 4)) 
<< 2; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 6); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 0)) << 6; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 6); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 6); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 2)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 6); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 6); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 4)) << 2; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 6); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_7bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_7bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 7); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 7); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 5)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 7); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 3)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 7); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 1)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 7); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 7); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 6)) << 1; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 7); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 4)) << 3; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 7); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = 
in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 2)) << 5; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 7); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_8bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_8bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_9bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_9bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 9); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 2)) << 7; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = 
tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 9); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 4)) << 5; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 9); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 6)) << 3; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 9); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 8)) << 1; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 1)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 9); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 3)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 9); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 5)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 9); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 7)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 9); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_10bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_10bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 10); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 4)) << 6; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 10); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 8)) << 2; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 2)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 10); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 6)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 0)) << 10; 
- out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 10); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 4)) << 6; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 10); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 8)) << 2; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 2)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 10); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 6)) << 4; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_11bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_11bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 11); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 6)) << 5; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 1)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 11); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 7)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 9); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 2)) << 9; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 11); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 8)) << 3; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 3)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 11); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 9)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 4)) << 7; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 11); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, 
lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 10)) << 1; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 5)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 11); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_12bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_12bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 12); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 8)) << 4; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 4)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 0)) << 12; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 12); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 8)) << 4; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 4)) << 8; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 0)) << 12; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 12); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 8)) << 4; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 4)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 0)) << 12; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 12); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 8)) << 4; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint16_t, 4)) << 8; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_13bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void 
_bit_unpack_16_13bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 13); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 10)) << 3; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 7)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 9); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 4)) << 9; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 1)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 13); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 11)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 8)) << 5; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 5)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 11); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 2)) << 11; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 13); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 12)) << 1; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 9)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint16_t, 6)) << 7; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint16_t, 3)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 13); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_14bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_14bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 14); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 12)) << 2; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & 
MASK(uint16_t, 10)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 8)) << 6; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 6)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 4)) << 10; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 2)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 0)) << 14; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 14); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 12)) << 2; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 10)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 8)) << 6; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint16_t, 6)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint16_t, 4)) << 10; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint16_t, 2)) << 12; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 14); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_15bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_15bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 15); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 14)) << 1; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 13)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 12)) << 3; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 11)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = 
in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 10)) << 5; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 9)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 8)) << 7; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 7)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 6)) << 9; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 5)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint16_t, 4)) << 11; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint16_t, 3)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint16_t, 2)) << 13; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint16_t, 1)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 15); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_16bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_16bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; - out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane]; - out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane]; - out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane]; - out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane]; - out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane]; - out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane]; - out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane]; - out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane]; - out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane]; - out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane]; - out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane]; - out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane]; - out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane]; - out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane]; - out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane]; - out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane]; + out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane] + reference; + out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane] + reference; + out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane] + reference; + out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane] + reference; + out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane] + reference; + out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane] + reference; + out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane] + reference; + out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane] + reference; + out[INDEX(8, lane)] = 
in[LANE_COUNT * 8 + lane] + reference; + out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane] + reference; + out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane] + reference; + out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane] + reference; + out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane] + reference; + out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane] + reference; + out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane] + reference; + out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane] + reference; } /// Runtime dispatch to the optimized lane decoder for the given bit width. @@ -885,292 +884,292 @@ __device__ inline void bit_unpack_16_lane( } } -__device__ void _bit_unpack_16_0bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_0bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_0bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_0bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_0bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_0bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_0bw_32t(in, out, thread_idx); + _bit_unpack_16_0bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_1bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_1bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_1bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_1bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_1bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_1bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_1bw_32t(in, out, thread_idx); + _bit_unpack_16_1bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_2bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_2bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_2bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_2bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_2bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_2bw_lane(in, shared_out, reference, thread_idx * 2 + 1); 
for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_2bw_32t(in, out, thread_idx); + _bit_unpack_16_2bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_3bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_3bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_3bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_3bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_3bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_3bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_3bw_32t(in, out, thread_idx); + _bit_unpack_16_3bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_4bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_4bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_4bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_4bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_4bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_4bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_4bw_32t(in, out, thread_idx); + _bit_unpack_16_4bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_5bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_5bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_5bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_5bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_5bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_5bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto 
idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_5bw_32t(in, out, thread_idx); + _bit_unpack_16_5bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_6bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_6bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_6bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_6bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_6bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_6bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_6bw_32t(in, out, thread_idx); + _bit_unpack_16_6bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_7bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_7bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_7bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_7bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_7bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_7bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_7bw_32t(in, out, thread_idx); + _bit_unpack_16_7bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_8bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_8bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_8bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_8bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_8bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_8bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = 
shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_8bw_32t(in, out, thread_idx); + _bit_unpack_16_8bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_9bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_9bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_9bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_9bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_9bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_9bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_9bw_32t(in, out, thread_idx); + _bit_unpack_16_9bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_10bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_10bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_10bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_10bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_10bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_10bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_10bw_32t(in, out, thread_idx); + _bit_unpack_16_10bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_11bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_11bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_11bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_11bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_11bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_11bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } 
-extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_11bw_32t(in, out, thread_idx); + _bit_unpack_16_11bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_12bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_12bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_12bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_12bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_12bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_12bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_12bw_32t(in, out, thread_idx); + _bit_unpack_16_12bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_13bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_13bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_13bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_13bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_13bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_13bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_13bw_32t(in, out, thread_idx); + _bit_unpack_16_13bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_14bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_14bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_14bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_14bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_14bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_14bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } 
-extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_14bw_32t(in, out, thread_idx); + _bit_unpack_16_14bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_15bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_15bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_15bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_15bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_15bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_15bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_15bw_32t(in, out, thread_idx); + _bit_unpack_16_15bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_16bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_16_16bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, int thread_idx) { __shared__ uint16_t shared_out[1024]; - _bit_unpack_16_16bw_lane(in, shared_out, thread_idx * 2 + 0); - _bit_unpack_16_16bw_lane(in, shared_out, thread_idx * 2 + 1); + _bit_unpack_16_16bw_lane(in, shared_out, reference, thread_idx * 2 + 0); + _bit_unpack_16_16bw_lane(in, shared_out, reference, thread_idx * 2 + 1); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, const uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_16bw_32t(in, out, thread_idx); + _bit_unpack_16_16bw_32t(in, out, reference, thread_idx); } diff --git a/vortex-cuda/kernels/src/bit_unpack_32.cu b/vortex-cuda/kernels/src/bit_unpack_32.cu index 91dd1c2e317..bfab7414d60 100644 --- a/vortex-cuda/kernels/src/bit_unpack_32.cu +++ b/vortex-cuda/kernels/src/bit_unpack_32.cu @@ -4,3241 +4,3240 @@ #include #include "fastlanes_common.cuh" -__device__ void _bit_unpack_32_0bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_0bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; - uint32_t zero = 0ULL; - out[INDEX(0, lane)] = zero; - out[INDEX(1, lane)] = zero; - 
out[INDEX(2, lane)] = zero; - out[INDEX(3, lane)] = zero; - out[INDEX(4, lane)] = zero; - out[INDEX(5, lane)] = zero; - out[INDEX(6, lane)] = zero; - out[INDEX(7, lane)] = zero; - out[INDEX(8, lane)] = zero; - out[INDEX(9, lane)] = zero; - out[INDEX(10, lane)] = zero; - out[INDEX(11, lane)] = zero; - out[INDEX(12, lane)] = zero; - out[INDEX(13, lane)] = zero; - out[INDEX(14, lane)] = zero; - out[INDEX(15, lane)] = zero; - out[INDEX(16, lane)] = zero; - out[INDEX(17, lane)] = zero; - out[INDEX(18, lane)] = zero; - out[INDEX(19, lane)] = zero; - out[INDEX(20, lane)] = zero; - out[INDEX(21, lane)] = zero; - out[INDEX(22, lane)] = zero; - out[INDEX(23, lane)] = zero; - out[INDEX(24, lane)] = zero; - out[INDEX(25, lane)] = zero; - out[INDEX(26, lane)] = zero; - out[INDEX(27, lane)] = zero; - out[INDEX(28, lane)] = zero; - out[INDEX(29, lane)] = zero; - out[INDEX(30, lane)] = zero; - out[INDEX(31, lane)] = zero; -} - -__device__ void _bit_unpack_32_1bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { + out[INDEX(0, lane)] = reference; + out[INDEX(1, lane)] = reference; + out[INDEX(2, lane)] = reference; + out[INDEX(3, lane)] = reference; + out[INDEX(4, lane)] = reference; + out[INDEX(5, lane)] = reference; + out[INDEX(6, lane)] = reference; + out[INDEX(7, lane)] = reference; + out[INDEX(8, lane)] = reference; + out[INDEX(9, lane)] = reference; + out[INDEX(10, lane)] = reference; + out[INDEX(11, lane)] = reference; + out[INDEX(12, lane)] = reference; + out[INDEX(13, lane)] = reference; + out[INDEX(14, lane)] = reference; + out[INDEX(15, lane)] = reference; + out[INDEX(16, lane)] = reference; + out[INDEX(17, lane)] = reference; + out[INDEX(18, lane)] = reference; + out[INDEX(19, lane)] = reference; + out[INDEX(20, lane)] = reference; + out[INDEX(21, lane)] = reference; + out[INDEX(22, lane)] = reference; + out[INDEX(23, lane)] = reference; + out[INDEX(24, lane)] = reference; + out[INDEX(25, lane)] = reference; + out[INDEX(26, lane)] = reference; + out[INDEX(27, lane)] = reference; + out[INDEX(28, lane)] = reference; + out[INDEX(29, lane)] = reference; + out[INDEX(30, lane)] = reference; + out[INDEX(31, lane)] = reference; +} + +__device__ void _bit_unpack_32_1bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 1); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 1); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 1); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 1); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 1); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 1); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 1); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 1); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 1); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 1); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 1); - out[INDEX(10, lane)] = 
tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 1); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 1); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 1); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 1); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 1); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 1); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 1); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 1); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 1); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 1); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 1); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 1); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 1); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 1); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 1); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 1); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 1); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 1); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 1); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 1); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_2bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_2bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 2); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 2); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 2); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 2); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 2); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 2); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 2); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 2); - out[INDEX(7, lane)] = tmp; + 
out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 2); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 2); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 2); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 2); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 2); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 2); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 2); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 0)) << 2; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 2); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 2); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 2); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 2); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 2); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 2); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 2); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 2); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 2); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 2); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 2); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 2); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 2); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 2); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 2); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_3bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_3bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 3); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 3); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 3); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 3); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 
12) & MASK(uint32_t, 3); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 3); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 3); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 3); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 3); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 3); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 1)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 3); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 3); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 3); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 3); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 3); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 3); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 3); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 3); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 3); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 3); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 2)) << 1; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 3); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 3); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 3); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 3); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 3); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 3); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 3); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 3); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 3); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_4bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_4bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = 
in[lane]; tmp = (src >> 0) & MASK(uint32_t, 4); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 4); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 4); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 4); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 4); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 4); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 4); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 0)) << 4; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 4); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 4); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 4); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 4); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 4); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 4); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 4); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 0)) << 4; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 4); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 4); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 4); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 4); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 4); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 4); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 4); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 4; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 4); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 4); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 4); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 4); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 4); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 4); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 24) & 
MASK(uint32_t, 4); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_5bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_5bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 5); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 5); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 5); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 5); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 5); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 5); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 3)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 5); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 5); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 5); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 5); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 5); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 1)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 5); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 5); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 5); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 5); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 5); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 5); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 4)) << 1; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 5); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 5); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 5); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 5); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 5); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = 
in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 2)) << 3; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 5); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 5); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 5); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 5); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 5); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_6bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_6bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 6); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 6); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 6); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 6); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 6); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 4)) << 2; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 6); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 6); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 6); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 6); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 2)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 6); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 6); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 6); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 6); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 6; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 6); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 6); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 6); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 6); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp 
= (src >> 24) & MASK(uint32_t, 6); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 4)) << 2; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 6); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 6); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 6); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 6); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 2)) << 4; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 6); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 6); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 6); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 6); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_7bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_7bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 7); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 7); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 7); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 7); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 3)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 7); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 7); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 7); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 7); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 6)) << 1; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 7); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 7); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 7); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 2)) << 5; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; 
tmp = (src >> 2) & MASK(uint32_t, 7); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 7); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 7); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 7); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 5)) << 2; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 7); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 7); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 7); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 1)) << 6; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 7); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 7); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 7); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 7); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 4)) << 3; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 7); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 7); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 7); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_8bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_8bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + 
reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_9bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_9bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 9); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 9); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, 
lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 9); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 4)) << 5; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 9); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 9); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 9); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 8)) << 1; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 9); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 9); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 3)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 9); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 9); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 9); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 7)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 9); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 9); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 2)) << 7; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 9); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 9); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 9); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 6)) << 3; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 9); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 9); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 1)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 9); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 9); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 9); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 5)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, 
lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 9); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 9); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_10bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_10bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 10); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 10); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 10); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 8)) << 2; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 10); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 10); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 6)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 10); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 10); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 4)) << 6; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 10); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 10); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 2)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 10); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 10); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 0)) << 10; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 10); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 10); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 10); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 8)) << 2; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 10); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 10); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + 
reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 6)) << 4; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 10); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 10); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 4)) << 6; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 10); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 10); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 2)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 10); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 10); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_11bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_11bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 11); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 11); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 1)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 11); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 11); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 2)) << 9; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 11); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 11); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 3)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 11); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 11); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 4)) << 7; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 11); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 11); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src 
= in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 5)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 11); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 11); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 6)) << 5; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 11); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 11); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 7)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 11); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 11); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 8)) << 3; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 11); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 11); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 9)) << 2; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 11); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 11); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 10)) << 1; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 11); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_12bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_12bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 12); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 12); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 4)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 12); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 12); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 8)) << 4; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) 
& MASK(uint32_t, 12); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 12; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 12); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 12); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 4)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 12); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 12); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 4; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 12); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 0)) << 12; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 12); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 12); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 4)) << 8; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 12); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 12); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 8)) << 4; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 12); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 0)) << 12; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 12); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 12); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 4)) << 8; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 12); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 12); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 4; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 12); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void 
_bit_unpack_32_13bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_13bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 13); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 13); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 7)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 13); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 1)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 13); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 13); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 8)) << 5; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 13); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 2)) << 11; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 13); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 13); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 9)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 13); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 3)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 13); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 13); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 10)) << 3; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 13); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 4)) << 9; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 13); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 13); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 11)) << 2; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 
13); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 5)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 13); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 13); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 12)) << 1; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 13); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 6)) << 7; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 13); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_14bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_14bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 14); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 14); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 10)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 14); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 6)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 14); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 2)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 14); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 14); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 12)) << 2; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 14); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 6; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 14); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 4)) << 10; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 14); - 
out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 0)) << 14; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 14); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 14); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 10)) << 4; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 14); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 6)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 14); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 2)) << 12; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 14); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 14); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 12)) << 2; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 14); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 8)) << 6; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 14); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 10; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 14); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_15bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_15bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 15); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 15); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 13)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 15); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 11)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 15); - 
out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 9)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 15); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 7)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 15); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 5)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 15); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 3)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 15); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 1)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 15); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 15); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 14)) << 1; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 15); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 12)) << 3; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 15); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 10)) << 5; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 15); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 7; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 15); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 6)) << 9; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 15); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 11; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 15); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 2)) << 13; - out[INDEX(29, 
lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 15); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_16bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_16bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + 
reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_17bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_17bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 17); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 2)) << 15; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 17); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 4)) << 13; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 17); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 6)) << 11; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 17); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 8)) << 9; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 17); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 10)) << 7; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 
17); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 12)) << 5; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 17); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 14)) << 3; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 17); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 16)) << 1; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 1)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 17); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 3)) << 14; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 17); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 5)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 17); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 7)) << 10; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 17); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 9)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 17); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 11)) << 6; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 17); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 13)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 17); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 15)) << 2; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_18bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_18bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; 
src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 18); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 4)) << 14; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 18); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 8)) << 10; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 18); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 12)) << 6; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 18); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 16)) << 2; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 2)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 18); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 6)) << 12; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 18); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 10)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 18); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 14)) << 4; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 0)) << 18; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 18); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 4)) << 14; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 18); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 10; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 18); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 12)) << 6; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 18); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 16)) << 2; - 
out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 2)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 18); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 6)) << 12; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 18); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 10)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 18); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 14)) << 4; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_19bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_19bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 19); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 6)) << 13; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 19); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 12)) << 7; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 19); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 18)) << 1; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 5)) << 14; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 19); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 11)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 19); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 17)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 4)) << 15; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 19); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 
23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 10)) << 9; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 19); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 16)) << 3; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 3)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 19); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 9)) << 10; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 19); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 15)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 2)) << 17; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 19); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 8)) << 11; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 19); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 14)) << 5; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 1)) << 18; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 19); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 7)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 19); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 13)) << 6; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_20bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_20bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 20); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 8)) << 12; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = 
(src >> 8) & MASK(uint32_t, 20); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 16)) << 4; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 4)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 20); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 12)) << 8; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 0)) << 20; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 20); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 8)) << 12; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 20); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 16)) << 4; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 4)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 20); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 12)) << 8; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 0)) << 20; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 20); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 12; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 20); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 16)) << 4; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 20); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 12)) << 8; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 0)) << 20; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 20); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp 
+ reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 8)) << 12; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 20); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 16)) << 4; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 4)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 20); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 12)) << 8; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_21bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_21bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 21); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 10)) << 11; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 21); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 20)) << 1; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 9)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 21); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 19)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 13; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 21); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 18)) << 3; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 7)) << 14; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 21); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 17)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 6)) << 
15; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 21); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 16)) << 5; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 5)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 21); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 15)) << 6; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 17; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 21); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 14)) << 7; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 3)) << 18; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 21); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 13)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 2)) << 19; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 21); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 12)) << 9; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 1)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 21); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 11)) << 10; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_22bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_22bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 22); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 12)) << 10; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & 
MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 2)) << 20; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 22); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 14)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 4)) << 18; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 22); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 16)) << 6; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 6)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 22); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 18)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 8)) << 14; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 22); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 20)) << 2; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 10)) << 12; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 0)) << 22; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 22); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 12)) << 10; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 2)) << 20; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 22); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 14)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 4)) << 18; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 22); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 16)) << 6; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 
16); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 6)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 22); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 18)) << 4; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 8)) << 14; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 22); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 20)) << 2; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 10)) << 12; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_23bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_23bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 23); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 14)) << 9; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 5)) << 18; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 23); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 19)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 10)) << 13; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 1)) << 22; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 23); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 15)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 6)) << 17; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 23); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 20)) << 3; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= 
(src & MASK(uint32_t, 11)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 2)) << 21; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 23); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 16)) << 7; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 7)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 23); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 21)) << 2; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 12)) << 11; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 3)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 23); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 17)) << 6; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 8)) << 15; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 23); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 22)) << 1; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 13)) << 10; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 4)) << 19; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 23); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 18)) << 5; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 9)) << 14; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_24bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_24bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & 
MASK(uint32_t, 24); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(21, lane)] = tmp; + 
out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_25bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_25bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 25); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 18)) << 7; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 11)) << 14; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 4)) << 21; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 25); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 22)) << 3; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 15)) << 10; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 8)) << 17; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 1)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 25); - 
out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 19)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 12)) << 13; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 5)) << 20; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 25); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 23)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 16)) << 9; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 9)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 2)) << 23; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 25); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 20)) << 5; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 13)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 6)) << 19; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 25); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 24)) << 1; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 17)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 10)) << 15; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 3)) << 22; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 25); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 21)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 14)) << 11; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 
24]; tmp |= (src & MASK(uint32_t, 7)) << 18; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 25); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_26bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_26bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 26); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 20)) << 6; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 14)) << 12; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 8)) << 18; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 2)) << 24; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 26); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 22)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 16)) << 10; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 10)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 4)) << 22; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 26); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 24)) << 2; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 18)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 12)) << 14; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 6)) << 20; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 0)) << 26; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 26); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 20)) << 6; - out[INDEX(17, lane)] = tmp; + 
out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 14)) << 12; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 8)) << 18; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 2)) << 24; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 26); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 22)) << 4; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 16)) << 10; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 10)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 4)) << 22; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 26); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 24)) << 2; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 18)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 12)) << 14; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 6)) << 20; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_27bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_27bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 27); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 22)) << 5; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 17)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 12)) << 15; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 7)) << 20; - out[INDEX(4, lane)] 
= tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 25); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 2)) << 25; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 27); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 24)) << 3; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 19)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 14)) << 13; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 9)) << 18; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 4)) << 23; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 27); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 26)) << 1; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 21)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 16)) << 11; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 11)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 6)) << 21; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 1)) << 26; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 27); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 23)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 18)) << 9; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 13)) << 14; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 8)) << 19; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 3)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 3) & 
MASK(uint32_t, 27); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 25)) << 2; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 20)) << 7; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 15)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 10)) << 17; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 5)) << 22; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 27); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_28bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_28bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 28); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 24)) << 4; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 20)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 16)) << 12; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 12)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 20; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 4)) << 24; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 0)) << 28; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 28); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 24)) << 4; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 20)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 16)) << 12; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & 
MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 12)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 8)) << 20; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 24; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 0)) << 28; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 28); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 24)) << 4; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 20)) << 8; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 16)) << 12; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 12)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 8)) << 20; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 4)) << 24; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 0)) << 28; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 28); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 24)) << 4; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 20)) << 8; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 16)) << 12; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 12)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 8)) << 20; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint32_t, 4)) << 24; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_29bw_lane(const uint32_t *__restrict in, uint32_t 
*__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_29bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 29); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 26)) << 3; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 23)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 20)) << 9; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 17)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 14)) << 15; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 11)) << 18; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 8)) << 21; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 5)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 27); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 2)) << 27; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 29); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 28)) << 1; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 25)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 22)) << 7; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 19)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 16)) << 13; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 13)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 10)) << 19; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 7)) << 22; - 
out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 25); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 4)) << 25; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 1)) << 28; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 29); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 27)) << 2; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 24)) << 5; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 21)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 18)) << 11; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 15)) << 14; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 12)) << 17; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 9)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint32_t, 6)) << 23; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint32_t, 3)) << 26; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 29); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_30bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_30bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 30); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 28)) << 2; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 26)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 24)) << 6; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 22)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); 
src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 20)) << 10; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 18)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 16)) << 14; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 14)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 12)) << 18; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 10)) << 20; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 22; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 6)) << 24; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 26; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 2)) << 28; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 30); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 0)) << 30; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 30); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 28)) << 2; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 26)) << 4; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 24)) << 6; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 22)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 20)) << 10; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 18)) << 12; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 16)) << 14; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 14)) << 16; - out[INDEX(24, lane)] = tmp; + 
out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 12)) << 18; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 10)) << 20; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 8)) << 22; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint32_t, 6)) << 24; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint32_t, 4)) << 26; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint32_t, 2)) << 28; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 30); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_31bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_31bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 31); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 30)) << 1; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 29)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 28)) << 3; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 27)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 26)) << 5; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 25)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 24)) << 7; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 23)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 22)) << 9; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 21)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src 
>> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 20)) << 11; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 19)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 18)) << 13; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 17)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 16)) << 15; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 15)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 14)) << 17; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 13)) << 18; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 12)) << 19; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 11)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 10)) << 21; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 9)) << 22; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 8)) << 23; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 7)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 25); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 6)) << 25; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 5)) << 26; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 27); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint32_t, 4)) << 27; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint32_t, 3)) << 28; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 29); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint32_t, 2)) << 29; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 2) & 
MASK(uint32_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint32_t, 1)) << 30; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 31); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_32bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_32bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; - out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane]; - out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane]; - out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane]; - out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane]; - out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane]; - out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane]; - out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane]; - out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane]; - out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane]; - out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane]; - out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane]; - out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane]; - out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane]; - out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane]; - out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane]; - out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane]; - out[INDEX(16, lane)] = in[LANE_COUNT * 16 + lane]; - out[INDEX(17, lane)] = in[LANE_COUNT * 17 + lane]; - out[INDEX(18, lane)] = in[LANE_COUNT * 18 + lane]; - out[INDEX(19, lane)] = in[LANE_COUNT * 19 + lane]; - out[INDEX(20, lane)] = in[LANE_COUNT * 20 + lane]; - out[INDEX(21, lane)] = in[LANE_COUNT * 21 + lane]; - out[INDEX(22, lane)] = in[LANE_COUNT * 22 + lane]; - out[INDEX(23, lane)] = in[LANE_COUNT * 23 + lane]; - out[INDEX(24, lane)] = in[LANE_COUNT * 24 + lane]; - out[INDEX(25, lane)] = in[LANE_COUNT * 25 + lane]; - out[INDEX(26, lane)] = in[LANE_COUNT * 26 + lane]; - out[INDEX(27, lane)] = in[LANE_COUNT * 27 + lane]; - out[INDEX(28, lane)] = in[LANE_COUNT * 28 + lane]; - out[INDEX(29, lane)] = in[LANE_COUNT * 29 + lane]; - out[INDEX(30, lane)] = in[LANE_COUNT * 30 + lane]; - out[INDEX(31, lane)] = in[LANE_COUNT * 31 + lane]; + out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane] + reference; + out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane] + reference; + out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane] + reference; + out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane] + reference; + out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane] + reference; + out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane] + reference; + out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane] + reference; + out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane] + reference; + out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane] + reference; + out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane] + reference; + out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane] + reference; + out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane] + reference; + out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane] + reference; + out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane] + reference; + out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane] + reference; + out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane] + reference; + out[INDEX(16, lane)] = in[LANE_COUNT * 16 + lane] + reference; + out[INDEX(17, lane)] = in[LANE_COUNT * 17 + lane] + reference; + out[INDEX(18, lane)] = in[LANE_COUNT * 18 + lane] + reference; + out[INDEX(19, lane)] = in[LANE_COUNT * 19 + lane] + reference; + out[INDEX(20, lane)] 
= in[LANE_COUNT * 20 + lane] + reference; + out[INDEX(21, lane)] = in[LANE_COUNT * 21 + lane] + reference; + out[INDEX(22, lane)] = in[LANE_COUNT * 22 + lane] + reference; + out[INDEX(23, lane)] = in[LANE_COUNT * 23 + lane] + reference; + out[INDEX(24, lane)] = in[LANE_COUNT * 24 + lane] + reference; + out[INDEX(25, lane)] = in[LANE_COUNT * 25 + lane] + reference; + out[INDEX(26, lane)] = in[LANE_COUNT * 26 + lane] + reference; + out[INDEX(27, lane)] = in[LANE_COUNT * 27 + lane] + reference; + out[INDEX(28, lane)] = in[LANE_COUNT * 28 + lane] + reference; + out[INDEX(29, lane)] = in[LANE_COUNT * 29 + lane] + reference; + out[INDEX(30, lane)] = in[LANE_COUNT * 30 + lane] + reference; + out[INDEX(31, lane)] = in[LANE_COUNT * 31 + lane] + reference; } /// Runtime dispatch to the optimized lane decoder for the given bit width. @@ -3285,531 +3284,531 @@ __device__ inline void bit_unpack_32_lane( } } -__device__ void _bit_unpack_32_0bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_0bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_0bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_0bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_0bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_0bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_0bw_32t(in, out, thread_idx); + _bit_unpack_32_0bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_1bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_1bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_1bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_1bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_1bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_1bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_1bw_32t(in, out, thread_idx); + _bit_unpack_32_1bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_2bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_2bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_2bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_2bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void 
bit_unpack_32_2bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_2bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_2bw_32t(in, out, thread_idx); + _bit_unpack_32_2bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_3bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_3bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_3bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_3bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_3bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_3bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_3bw_32t(in, out, thread_idx); + _bit_unpack_32_3bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_4bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_4bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_4bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_4bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_4bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_4bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_4bw_32t(in, out, thread_idx); + _bit_unpack_32_4bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_5bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_5bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_5bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_5bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_5bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_5bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_5bw_32t(in, out, thread_idx); + _bit_unpack_32_5bw_32t(in, 
out, reference, thread_idx); } -__device__ void _bit_unpack_32_6bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_6bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_6bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_6bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_6bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_6bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_6bw_32t(in, out, thread_idx); + _bit_unpack_32_6bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_7bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_7bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_7bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_7bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_7bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_7bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_7bw_32t(in, out, thread_idx); + _bit_unpack_32_7bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_8bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_8bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_8bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_8bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_8bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_8bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_8bw_32t(in, out, thread_idx); + _bit_unpack_32_8bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_9bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_9bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_9bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_9bw_lane(in, shared_out, reference, 
thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_9bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_9bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_9bw_32t(in, out, thread_idx); + _bit_unpack_32_9bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_10bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_10bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_10bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_10bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_10bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_10bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_10bw_32t(in, out, thread_idx); + _bit_unpack_32_10bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_11bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_11bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_11bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_11bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_11bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_11bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_11bw_32t(in, out, thread_idx); + _bit_unpack_32_11bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_12bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_12bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_12bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_12bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_12bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_12bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = 
full_in + (blockIdx.x * (128 * 12 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_12bw_32t(in, out, thread_idx); + _bit_unpack_32_12bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_13bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_13bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_13bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_13bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_13bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_13bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_13bw_32t(in, out, thread_idx); + _bit_unpack_32_13bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_14bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_14bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_14bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_14bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_14bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_14bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_14bw_32t(in, out, thread_idx); + _bit_unpack_32_14bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_15bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_15bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_15bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_15bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_15bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_15bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_15bw_32t(in, out, thread_idx); + _bit_unpack_32_15bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_16bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_16bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, 
const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_16bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_16bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_16bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_16bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_16bw_32t(in, out, thread_idx); + _bit_unpack_32_16bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_17bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_17bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_17bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_17bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_17bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_17bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 17 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_17bw_32t(in, out, thread_idx); + _bit_unpack_32_17bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_18bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_18bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_18bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_18bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_18bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_18bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 18 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_18bw_32t(in, out, thread_idx); + _bit_unpack_32_18bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_19bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_19bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_19bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_19bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_19bw_32t(const uint32_t *__restrict full_in, uint32_t 
*__restrict full_out) { +extern "C" __global__ void bit_unpack_32_19bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 19 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_19bw_32t(in, out, thread_idx); + _bit_unpack_32_19bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_20bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_20bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_20bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_20bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_20bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_20bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 20 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_20bw_32t(in, out, thread_idx); + _bit_unpack_32_20bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_21bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_21bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_21bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_21bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_21bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_21bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 21 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_21bw_32t(in, out, thread_idx); + _bit_unpack_32_21bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_22bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_22bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_22bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_22bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_22bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_22bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 22 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_22bw_32t(in, out, thread_idx); + _bit_unpack_32_22bw_32t(in, out, reference, thread_idx); } 
-__device__ void _bit_unpack_32_23bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_23bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_23bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_23bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_23bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_23bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 23 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_23bw_32t(in, out, thread_idx); + _bit_unpack_32_23bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_24bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_24bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_24bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_24bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_24bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_24bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 24 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_24bw_32t(in, out, thread_idx); + _bit_unpack_32_24bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_25bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_25bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_25bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_25bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_25bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_25bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 25 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_25bw_32t(in, out, thread_idx); + _bit_unpack_32_25bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_26bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_26bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_26bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_26bw_lane(in, shared_out, reference, 
thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_26bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_26bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 26 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_26bw_32t(in, out, thread_idx); + _bit_unpack_32_26bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_27bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_27bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_27bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_27bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_27bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_27bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 27 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_27bw_32t(in, out, thread_idx); + _bit_unpack_32_27bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_28bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_28bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_28bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_28bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_28bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_28bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 28 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_28bw_32t(in, out, thread_idx); + _bit_unpack_32_28bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_29bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_29bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_29bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_29bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_29bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_29bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = 
full_in + (blockIdx.x * (128 * 29 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_29bw_32t(in, out, thread_idx); + _bit_unpack_32_29bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_30bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_30bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_30bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_30bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_30bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_30bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 30 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_30bw_32t(in, out, thread_idx); + _bit_unpack_32_30bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_31bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_31bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_31bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_31bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_31bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_31bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 31 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_31bw_32t(in, out, thread_idx); + _bit_unpack_32_31bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_32bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_32_32bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, int thread_idx) { __shared__ uint32_t shared_out[1024]; - _bit_unpack_32_32bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_32_32bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_32_32bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_32bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, const uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 32 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_32bw_32t(in, out, thread_idx); + _bit_unpack_32_32bw_32t(in, out, reference, thread_idx); } diff --git a/vortex-cuda/kernels/src/bit_unpack_64.cu b/vortex-cuda/kernels/src/bit_unpack_64.cu index 56286b6b50f..20a0ff5b1f7 100644 --- a/vortex-cuda/kernels/src/bit_unpack_64.cu +++ 
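(The pattern across all of these generated kernels is the same: every `_bit_unpack_*` lane function and `extern "C"` entry point gains a `const` frame-of-reference scalar, and that `reference` is added to each unpacked value as it is written out, with the 0-bit-width case degenerating to writing `reference` directly, so the FoR decode is fused into the unpack instead of running as a second pass over the output. The sketch below is illustrative only and not part of the generated patch: the function names, the runtime `bit_width` parameter, and the simple sequential output layout are assumptions of mine; the real generated code is fully unrolled per bit width and writes through the FastLanes INDEX() transposed layout, with LANE_COUNT = 16 for the 64-bit kernels.

#include <stdint.h>

// Sketch of a fused FoR + bit-unpack lane function for 32-bit values:
// 32 lanes per 1024-value block, 32 values per lane, `bit_width` packed
// words per lane, interleaved as in[lane + LANE_COUNT * word].
__device__ void unpack_lane_with_reference(const uint32_t *__restrict in,
                                           uint32_t *__restrict out,
                                           const uint32_t reference,
                                           unsigned int lane,
                                           unsigned int bit_width) {
    const unsigned int LANE_COUNT = 32;
    if (bit_width == 0) {
        // Nothing is packed: every value is just the reference.
        for (unsigned int i = 0; i < 32; i++) out[i * LANE_COUNT + lane] = reference;
        return;
    }
    const uint32_t mask = bit_width == 32 ? ~0u : (1u << bit_width) - 1u;
    uint32_t src = in[lane];
    unsigned int consumed = 0;   // bits already taken from the current word
    unsigned int word = 0;       // which packed word of this lane we are on
    for (unsigned int i = 0; i < 32; i++) {
        uint32_t tmp = (src >> consumed) & mask;
        consumed += bit_width;
        if (consumed >= 32) {
            unsigned int over = consumed - 32;   // bits of this value in the next word
            consumed = over;
            word += 1;
            if (over > 0 || i + 1 < 32) {        // avoid reading past the last packed word
                src = in[lane + LANE_COUNT * word];
                if (over > 0) tmp |= (src & ((1u << over) - 1u)) << (bit_width - over);
            }
        }
        // The only functional change in this patch: add the reference while
        // unpacking instead of running a separate FoR pass over the output.
        out[i * LANE_COUNT + lane] = tmp + reference;
    }
}

// Hypothetical entry point mirroring the signature change made to the
// generated bit_unpack_32_* kernels: one block per 1024-value chunk,
// one thread per lane, with `reference` passed as a plain scalar argument.
extern "C" __global__ void unpack_with_reference(const uint32_t *__restrict full_in,
                                                 uint32_t *__restrict full_out,
                                                 const uint32_t reference,
                                                 unsigned int bit_width) {
    const uint32_t *in = full_in + blockIdx.x * (32 * bit_width);  // 1024 * bit_width / 32 words
    uint32_t *out = full_out + blockIdx.x * 1024;
    unpack_lane_with_reference(in, out, reference, threadIdx.x, bit_width);
}

Passing `reference` as a kernel scalar keeps the launch shape and memory traffic of the existing unpack kernels unchanged while folding the frame-of-reference addition into the same write, which is exactly what the `tmp + reference` rewrites throughout this diff do.)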
b/vortex-cuda/kernels/src/bit_unpack_64.cu @@ -4,12617 +4,12616 @@ #include #include "fastlanes_common.cuh" -__device__ void _bit_unpack_64_0bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_0bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; - uint64_t zero = 0ULL; - out[INDEX(0, lane)] = zero; - out[INDEX(1, lane)] = zero; - out[INDEX(2, lane)] = zero; - out[INDEX(3, lane)] = zero; - out[INDEX(4, lane)] = zero; - out[INDEX(5, lane)] = zero; - out[INDEX(6, lane)] = zero; - out[INDEX(7, lane)] = zero; - out[INDEX(8, lane)] = zero; - out[INDEX(9, lane)] = zero; - out[INDEX(10, lane)] = zero; - out[INDEX(11, lane)] = zero; - out[INDEX(12, lane)] = zero; - out[INDEX(13, lane)] = zero; - out[INDEX(14, lane)] = zero; - out[INDEX(15, lane)] = zero; - out[INDEX(16, lane)] = zero; - out[INDEX(17, lane)] = zero; - out[INDEX(18, lane)] = zero; - out[INDEX(19, lane)] = zero; - out[INDEX(20, lane)] = zero; - out[INDEX(21, lane)] = zero; - out[INDEX(22, lane)] = zero; - out[INDEX(23, lane)] = zero; - out[INDEX(24, lane)] = zero; - out[INDEX(25, lane)] = zero; - out[INDEX(26, lane)] = zero; - out[INDEX(27, lane)] = zero; - out[INDEX(28, lane)] = zero; - out[INDEX(29, lane)] = zero; - out[INDEX(30, lane)] = zero; - out[INDEX(31, lane)] = zero; - out[INDEX(32, lane)] = zero; - out[INDEX(33, lane)] = zero; - out[INDEX(34, lane)] = zero; - out[INDEX(35, lane)] = zero; - out[INDEX(36, lane)] = zero; - out[INDEX(37, lane)] = zero; - out[INDEX(38, lane)] = zero; - out[INDEX(39, lane)] = zero; - out[INDEX(40, lane)] = zero; - out[INDEX(41, lane)] = zero; - out[INDEX(42, lane)] = zero; - out[INDEX(43, lane)] = zero; - out[INDEX(44, lane)] = zero; - out[INDEX(45, lane)] = zero; - out[INDEX(46, lane)] = zero; - out[INDEX(47, lane)] = zero; - out[INDEX(48, lane)] = zero; - out[INDEX(49, lane)] = zero; - out[INDEX(50, lane)] = zero; - out[INDEX(51, lane)] = zero; - out[INDEX(52, lane)] = zero; - out[INDEX(53, lane)] = zero; - out[INDEX(54, lane)] = zero; - out[INDEX(55, lane)] = zero; - out[INDEX(56, lane)] = zero; - out[INDEX(57, lane)] = zero; - out[INDEX(58, lane)] = zero; - out[INDEX(59, lane)] = zero; - out[INDEX(60, lane)] = zero; - out[INDEX(61, lane)] = zero; - out[INDEX(62, lane)] = zero; - out[INDEX(63, lane)] = zero; -} - -__device__ void _bit_unpack_64_1bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { + out[INDEX(0, lane)] = reference; + out[INDEX(1, lane)] = reference; + out[INDEX(2, lane)] = reference; + out[INDEX(3, lane)] = reference; + out[INDEX(4, lane)] = reference; + out[INDEX(5, lane)] = reference; + out[INDEX(6, lane)] = reference; + out[INDEX(7, lane)] = reference; + out[INDEX(8, lane)] = reference; + out[INDEX(9, lane)] = reference; + out[INDEX(10, lane)] = reference; + out[INDEX(11, lane)] = reference; + out[INDEX(12, lane)] = reference; + out[INDEX(13, lane)] = reference; + out[INDEX(14, lane)] = reference; + out[INDEX(15, lane)] = reference; + out[INDEX(16, lane)] = reference; + out[INDEX(17, lane)] = reference; + out[INDEX(18, lane)] = reference; + out[INDEX(19, lane)] = reference; + out[INDEX(20, lane)] = reference; + out[INDEX(21, lane)] = reference; + out[INDEX(22, lane)] = reference; + out[INDEX(23, lane)] = reference; + out[INDEX(24, lane)] = reference; + out[INDEX(25, lane)] = reference; + out[INDEX(26, lane)] = reference; + out[INDEX(27, lane)] = reference; + out[INDEX(28, lane)] = 
reference; + out[INDEX(29, lane)] = reference; + out[INDEX(30, lane)] = reference; + out[INDEX(31, lane)] = reference; + out[INDEX(32, lane)] = reference; + out[INDEX(33, lane)] = reference; + out[INDEX(34, lane)] = reference; + out[INDEX(35, lane)] = reference; + out[INDEX(36, lane)] = reference; + out[INDEX(37, lane)] = reference; + out[INDEX(38, lane)] = reference; + out[INDEX(39, lane)] = reference; + out[INDEX(40, lane)] = reference; + out[INDEX(41, lane)] = reference; + out[INDEX(42, lane)] = reference; + out[INDEX(43, lane)] = reference; + out[INDEX(44, lane)] = reference; + out[INDEX(45, lane)] = reference; + out[INDEX(46, lane)] = reference; + out[INDEX(47, lane)] = reference; + out[INDEX(48, lane)] = reference; + out[INDEX(49, lane)] = reference; + out[INDEX(50, lane)] = reference; + out[INDEX(51, lane)] = reference; + out[INDEX(52, lane)] = reference; + out[INDEX(53, lane)] = reference; + out[INDEX(54, lane)] = reference; + out[INDEX(55, lane)] = reference; + out[INDEX(56, lane)] = reference; + out[INDEX(57, lane)] = reference; + out[INDEX(58, lane)] = reference; + out[INDEX(59, lane)] = reference; + out[INDEX(60, lane)] = reference; + out[INDEX(61, lane)] = reference; + out[INDEX(62, lane)] = reference; + out[INDEX(63, lane)] = reference; +} + +__device__ void _bit_unpack_64_1bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 1); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 1); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 1); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 1); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 1); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 1); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 1); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 1); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 1); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 1); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 1); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 1); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 1); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 1); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 1); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 1); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 1); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 1); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 1); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] 
= tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 1); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 1); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 1); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 1); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 1); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 1); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 1); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 1); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 1); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 1); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 1); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 1); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 1); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 1); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 1); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 1); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 1); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 1); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 1); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 1); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 1); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 1); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 1); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 1); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 1); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 1); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 1); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 1); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 1); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 1); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 1); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 1); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = 
tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 1); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 1); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 1); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 1); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 1); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 1); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 1); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 1); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 1); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 1); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 1); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 1); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_2bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_2bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 2); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 2); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 2); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 2); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 2); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 2); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 2); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 2); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 2); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 2); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 2); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 2); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 2); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 2); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 2); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 2); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp 
= (src >> 32) & MASK(uint64_t, 2); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 2); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 2); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 2); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 2); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 2); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 2); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 2); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 2); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 2); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 2); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 2); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 2); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 2); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 2); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 2; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 2); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 2); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 2); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 2); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 2); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 2); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 2); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 2); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 2); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 2); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 2); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 2); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 2); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 2); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 2); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 2); - 
out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 2); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 2); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 2); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 2); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 2); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 2); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 2); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 2); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 2); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 2); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 2); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 2); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 2); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 2); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 2); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_3bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_3bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 3); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 3); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 3); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 3); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 3); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 3); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 3); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 3); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 3); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 3); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 3); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 3); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 3); - out[INDEX(12, 
lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 3); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 3); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 3); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 3); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 3); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 3); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 3); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 3); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 1; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 3); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 3); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 3); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 3); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 3); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 3); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 3); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 3); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 3); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 3); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 3); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 3); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 3); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 3); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 3); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 3); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 3); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 3); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 3); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 3); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 1)) << 2; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 1) & 
MASK(uint64_t, 3); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 3); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 3); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 3); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 3); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 3); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 3); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 3); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 3); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 3); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 3); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 3); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 3); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 3); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 3); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 3); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 3); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 3); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 3); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 3); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_4bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_4bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 4); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 4); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 4); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 4); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 4); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 4); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 4); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 4); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 
4); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 4); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 4); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 4); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 4); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 4); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 4); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 4; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 4); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 4); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 4); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 4); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 4); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 4); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 4); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 4); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 4); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 4); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 4); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 4); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 4); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 4); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 4); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 0)) << 4; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 4); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 4); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 4); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 4); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 4); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 4); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 4); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 
28) & MASK(uint64_t, 4); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 4); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 4); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 4); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 4); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 4); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 4); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 4); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 4; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 4); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 4); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 4); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 4); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 4); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 4); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 4); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 4); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 4); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 4); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 4); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 4); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 4); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 4); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 4); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_5bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_5bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 5); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 5); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 5); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 5); - out[INDEX(3, 
lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 5); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 5); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 5); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 5); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 5); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 5); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 5); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 5); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 1)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 5); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 5); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 5); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 5); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 5); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 5); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 5); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 5); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 5); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 5); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 5); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 5); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 2)) << 3; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 5); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 5); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 5); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 5); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 5); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 5); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 5); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 5); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 5); - 
out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 5); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 5); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 5); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 3)) << 2; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 5); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 5); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 5); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 5); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 5); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 5); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 5); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 5); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 5); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 5); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 5); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 5); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 1; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 5); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 5); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 5); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 5); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 5); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 5); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 5); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 5); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 5); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 5); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 5); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_6bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { 
+__device__ void _bit_unpack_64_6bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 6); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 6); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 6); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 6); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 6); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 6); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 6); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 6); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 6); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 6); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 6); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 6); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 6); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 6); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 6); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 6); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 6); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 6); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 6); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 6); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 2; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 6); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 6); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 6); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 6); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 6); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 6); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 6); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + 
reference; tmp = (src >> 46) & MASK(uint64_t, 6); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 6); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 6; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 6); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 6); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 6); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 6); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 6); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 6); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 6); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 6); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 6); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 6); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 2)) << 4; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 6); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 6); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 6); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 6); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 6); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 6); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 6); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 6); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 6); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 6); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 2; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 6); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 6); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 6); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 6); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 6); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; 
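/*
 * Note on this hunk (explanatory comment, not generated code): each
 * `_bit_unpack_64_<N>bw_lane` helper now takes a `const uint64_t reference`
 * and adds it to every unpacked value, fusing the frame-of-reference (FoR)
 * decode into the bit-unpack step instead of running a second add pass over
 * the output; passing `reference = 0` recovers the plain bit-unpack result.
 * A minimal sketch of the fused pattern, assuming the same MASK/INDEX
 * helpers used by the generated kernels (illustrative only, not one of the
 * generated functions):
 *
 *   __device__ void unpack_lane_for_fused(const uint64_t *__restrict in,
 *                                         uint64_t *__restrict out,
 *                                         const uint64_t reference,
 *                                         unsigned int lane) {
 *       uint64_t src = in[lane];
 *       // extract the packed 6-bit value, then add the FoR base in the same store
 *       uint64_t tmp = (src >> 0) & MASK(uint64_t, 6);
 *       out[INDEX(0, lane)] = tmp + reference;
 *   }
 */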
tmp = (src >> 34) & MASK(uint64_t, 6); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 6); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 6); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 6); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_7bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_7bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 7); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 7); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 7); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 7); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 7); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 7); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 7); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 7); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 7); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 6)) << 1; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 7); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 7); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 7); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 7); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 7); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 7); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 7); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 7); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 5)) << 2; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 7); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 7); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 7); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 7); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp 
+ reference; tmp = (src >> 33) & MASK(uint64_t, 7); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 7); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 7); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 7); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 4)) << 3; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 7); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 7); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 7); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 7); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 7); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 7); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 7); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 7); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 3)) << 4; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 7); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 7); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 7); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 7); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 7); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 7); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 7); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 7); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 2)) << 5; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 7); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 7); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 7); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 7); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 7); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 7); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 7); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; 
tmp = (src >> 51) & MASK(uint64_t, 7); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 1)) << 6; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 7); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 7); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 7); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 7); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 7); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 7); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 7); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 7); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_8bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_8bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & 
MASK(uint64_t, 8); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 
8); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_9bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_9bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 9); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 9); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 9); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 9); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 9); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 9); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 9); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 1; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 9); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 9); - out[INDEX(9, lane)] 
= tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 9); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 9); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 9); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 9); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 7)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 9); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 9); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 9); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 9); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 9); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 9); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 3; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 9); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 9); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 9); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 9); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 9); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 9); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 5)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 9); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 9); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 9); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 9); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 9); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 9); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 5; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 9); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 9); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 9); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp 
= (src >> 31) & MASK(uint64_t, 9); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 9); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 9); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 3)) << 6; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 9); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 9); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 9); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 9); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 9); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 9); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 7; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 9); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 9); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 9); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 9); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 9); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 9); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 1)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 9); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 9); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 9); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 9); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 9); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 9); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_10bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_10bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 10); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 10); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = 
(src >> 20) & MASK(uint64_t, 10); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 10); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 10); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 10); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 6)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 10); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 10); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 10); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 10); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 10); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 2)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 10); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 10); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 10); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 10); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 10); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 10); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 2; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 10); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 10); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 10); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 10); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 10); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 6; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 10); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 10); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 10); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 10); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 10); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = 
in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 10; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 10); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 10); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 10); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 10); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 10); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 10); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 4; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 10); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 10); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 10); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 10); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 10); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 8; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 10); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 10); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 10); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 10); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 10); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 10); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 2; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 10); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 10); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 10); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 10); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 10); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 4)) << 6; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 10); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 10); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; 
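/*
 * Note (explanatory comment, not generated code): when a packed value
 * straddles a 64-bit word boundary, the generated code splices it from two
 * consecutive words and still adds the reference exactly once, e.g. the
 * width-10 value at index 6 above (60 + 10 > 64):
 *
 *   uint64_t tmp = (src >> 60) & MASK(uint64_t, 4);   // 4 low bits from word 0
 *   src = in[lane + LANE_COUNT * 1];                  // advance to word 1
 *   tmp |= (src & MASK(uint64_t, 6)) << 4;            // 6 high bits complete the 10-bit value
 *   out[INDEX(6, lane)] = tmp + reference;            // FoR base applied to the spliced value
 */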
tmp = (src >> 24) & MASK(uint64_t, 10); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 10); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 10); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_11bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_11bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 11); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 11); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 11); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 11); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 11); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 9; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 11); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 11); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 11); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 11); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 11); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 7; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 11); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 11); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 11); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 11); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 11); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 5; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 11); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 11); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 11); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 11); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 11); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; 
tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 3; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 11); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 11); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 11); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 11); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 11); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 1; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 11); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 11); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 11); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 11); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 1)) << 10; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 11); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 11); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 11); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 11); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 11); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 3)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 11); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 11); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 11); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 11); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 11); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 5)) << 6; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 11); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 11); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 11); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 11); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 11); - out[INDEX(51, lane)] = 
tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 7)) << 4; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 11); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 11); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 11); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 11); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 11); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 9)) << 2; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 11); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 11); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 11); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 11); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_12bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_12bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 12); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 12); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 12); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 12); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 12); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 4; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 12); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 12); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 12); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 12); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 12); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 12); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 12); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + 
reference; tmp = (src >> 40) & MASK(uint64_t, 12); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 12; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 12); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 12); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 12); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 12); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 12); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 4; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 12); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 12); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 12); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 12); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 8; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 12); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 12); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 12); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 12); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 12; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 12); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 12); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 12); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 12); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 12); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 4; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 12); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 12); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 12); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 12); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane 
+ LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 4)) << 8; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 12); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 12); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 12); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 12); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 12; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 12); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 12); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 12); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 12); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 12); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 4; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 12); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 12); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 12); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 12); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 4)) << 8; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 12); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 12); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 12); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 12); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_13bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_13bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 13); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 13); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 13); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 13); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 1]; tmp 
|= (src & MASK(uint64_t, 1)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 13); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 13); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 13); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 13); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 2)) << 11; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 13); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 13); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 13); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 13); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 3)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 13); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 13); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 13); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 13); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 9; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 13); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 13); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 13); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 13); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 5)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 13); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 13); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 13); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 13); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 7; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 13); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 13); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 13); - out[INDEX(32, lane)] = tmp; + 
out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 13); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 7)) << 6; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 13); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 13); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 13); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 13); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 5; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 13); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 13); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 13); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 13); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 9)) << 4; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 13); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 13); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 13); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 13); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 10)) << 3; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 13); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 13); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 13); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 13); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 11)) << 2; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 13); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 13); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 13); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 13); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 1; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 13); - out[INDEX(60, 
lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 13); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 13); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_14bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_14bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 14); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 14); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 14); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 14); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 6)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 14); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 14); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 14); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 14); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 12)) << 2; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 14); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 14); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 14); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 4)) << 10; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 14); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 14); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 14); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 14); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 10)) << 4; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 14); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 14); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 14); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 5]; tmp |= (src & 
MASK(uint64_t, 2)) << 12; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 14); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 14); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 14); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 14); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 8)) << 6; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 14); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 14); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 14); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 14; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 14); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 14); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 14); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 14); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 6)) << 8; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 14); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 14); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 14); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 14); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 2; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 14); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 14); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 14); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 4)) << 10; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 14); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 14); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 14); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 14); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp 
|= (src & MASK(uint64_t, 10)) << 4; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 14); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 14); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 14); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 2)) << 12; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 14); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 14); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 14); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 14); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 6; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 14); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 14); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 14); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_15bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_15bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 15); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 15); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 15); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 15); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 11)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 15); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 15); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 15); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 7)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 15); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 15); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 15); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); 
src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 3)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 15); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 15); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 15); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 15); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 1; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 15); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 15); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 15); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 5; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 15); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 15); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 15); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 9; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 15); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 15); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 15); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 13; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 15); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 15); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 15); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 15); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 13)) << 2; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 15); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 15); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 15); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 9)) << 6; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 15); - out[INDEX(39, lane)] = tmp; 
+ out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 15); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 15); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 5)) << 10; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 15); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 15); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 15); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 1)) << 14; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 15); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 15); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 15); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 15); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 3; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 15); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 15); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 15); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 7; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 15); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 15); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 15); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 11; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 15); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 15); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 15); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_16bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_16bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src 
>> 16) & MASK(uint64_t, 16); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(28, 
lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(55, lane)] = tmp; + 
out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_17bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_17bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 17); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 17); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 17); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 4)) << 13; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 17); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 17); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 17); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 8)) << 9; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 17); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 17); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 17); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 12)) << 5; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 17); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 17); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 17); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 16)) << 1; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 17); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 
17); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 3)) << 14; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 17); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 17); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 17); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 7)) << 10; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 17); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 17); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 17); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 11)) << 6; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 17); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 17); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 17); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 15)) << 2; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 17); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 17); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 2)) << 15; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 17); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 17); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 17); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 6)) << 11; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 17); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 17); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 17); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 7; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 17); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 17); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 17); - out[INDEX(44, 
lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 14)) << 3; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 17); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 17); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 1)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 17); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 17); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 17); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 5)) << 12; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 17); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 17); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 17); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 9)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 17); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 17); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 17); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 13)) << 4; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 17); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 17); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_18bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_18bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 18); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 18); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 18); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 10; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 18); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 18); - 
out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 18); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 16)) << 2; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 18); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 18); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 12; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 18); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 18); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 18); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 4; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 18); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 18); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 14; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 18); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 18); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 18); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 6; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 18); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 18); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 18); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 18); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 18); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 10)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 18); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 18); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 18; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = 
(src >> 0) & MASK(uint64_t, 18); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 18); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 18); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 10; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 18); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 18); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 18); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 2; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 18); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 18); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 6)) << 12; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 18); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 18); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 18); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 14)) << 4; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 18); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 18); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 14; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 18); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 18); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 18); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 12)) << 6; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 18); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 18); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 18); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 18); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 38) & 
MASK(uint64_t, 18); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 10)) << 8; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 18); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 18); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_19bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_19bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 19); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 19); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 19); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 12)) << 7; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 19); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 19); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 5)) << 14; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 19); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 19); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 19); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 17)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 19); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 19); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 10)) << 9; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 19); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 19); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 3)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 19); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 19); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 19); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 60) & 
MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 15)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 19); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 19); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 11; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 19); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 19); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 1)) << 18; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 19); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 19); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 19); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 13)) << 6; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 19); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 19); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 6)) << 13; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 19); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 19); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 19); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 18)) << 1; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 19); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 19); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 11)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 19); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 19); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 4)) << 15; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 19); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 19); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 19); - out[INDEX(46, lane)] = tmp; + 
out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 3; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 19); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 19); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 9)) << 10; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 19); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 19); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 17; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 19); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 19); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 19); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 14)) << 5; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 19); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 19); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 7)) << 12; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 19); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 19); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_20bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_20bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 20); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 20); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 20); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 16)) << 4; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 20); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 20); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 12)) << 8; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + 
reference; tmp = (src >> 12) & MASK(uint64_t, 20); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 20); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 12; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 20); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 20); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 20); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 20); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 20; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 20); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 20); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 20); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 16)) << 4; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 20); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 20); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 12)) << 8; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 20); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 20); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 12; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 20); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 20); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 4)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 20); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 20); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 0)) << 20; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 20); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 20); - out[INDEX(33, 
lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 20); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 4; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 20); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 20); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 8; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 20); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 20); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 12; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 20); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 20); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 16; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 20); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 20); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 20; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 20); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 20); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 20); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 16)) << 4; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 20); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 20); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 8; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 20); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 20); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 8)) << 12; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 20); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 20); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; 
tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 4)) << 16; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 20); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 20); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_21bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_21bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 21); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 21); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 21); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 20)) << 1; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 21); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 21); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 19)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 21); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 21); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 18)) << 3; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 21); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 21); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 17)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 21); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 21); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 5; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 21); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 21); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 15)) << 6; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 21); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 21); 
- out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 14)) << 7; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 21); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 21); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 13)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 21); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 21); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 9; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 21); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 21); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 11)) << 10; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 21); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 21); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 11; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 21); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 21); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 9)) << 12; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 21); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 21); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 13; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 21); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 21); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 7)) << 14; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 21); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 21); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 6)) << 15; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 6) & 
MASK(uint64_t, 21); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 21); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 5)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 21); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 21); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 17; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 21); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 21); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 3)) << 18; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 21); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 21); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 2)) << 19; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 21); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 21); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 1)) << 20; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 21); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 21); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_22bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_22bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 22); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 22); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 20; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 22); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 22); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 18; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 22); - 
out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 22); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 22); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 22); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 14; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 22); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 22); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 12; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 22); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 22); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 10; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 22); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 22); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 14)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 22); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 22); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 6; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 22); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 22); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 18)) << 4; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 22); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 22); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 2; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 22); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 0)) << 22; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 
22); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 22); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 2)) << 20; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 22); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 22); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 4)) << 18; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 22); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 22); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 6)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 22); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 22); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 8)) << 14; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 22); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 22); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 10)) << 12; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 22); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 22); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 10; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 22); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 22); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 14)) << 8; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 22); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 22); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 16)) << 6; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 22); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 22); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 20]; tmp |= (src & 
MASK(uint64_t, 18)) << 4; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 22); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 22); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 20)) << 2; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 22); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_23bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_23bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 23); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 23); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 5)) << 18; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 23); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 23); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 10)) << 13; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 23); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 23); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 15)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 23); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 23); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 20)) << 3; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 23); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 2)) << 21; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 23); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 23); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 7)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 23); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 23); - 
out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 12)) << 11; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 23); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 23); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 17)) << 6; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 23); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 23); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 22)) << 1; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 23); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 4)) << 19; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 23); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 23); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 9)) << 14; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 23); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 23); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 14)) << 9; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 23); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 23); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 19)) << 4; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 23); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 1)) << 22; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 23); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 23); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 6)) << 17; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 23); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 23); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 52) & 
MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 11)) << 12; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 23); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 23); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 16)) << 7; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 23); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 23); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 21)) << 2; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 23); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 3)) << 20; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 23); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 23); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 8)) << 15; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 23); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 23); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 13)) << 10; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 23); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 23); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 18)) << 5; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 23); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_24bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_24bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(3, lane)] = tmp; + 
out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 11]; tmp 
|= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + 
reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_25bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_25bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 25); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 25); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 11)) << 14; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 25); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 25); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 22)) << 3; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 25); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 17; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 25); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 25); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 19)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 25); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 5)) << 20; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 25); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 25); - out[INDEX(14, lane)] = 
tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 16)) << 9; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 25); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 23; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 25); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 25); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 13)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 25); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 25); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 1; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 25); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 10)) << 15; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 25); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 25); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 21)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 25); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 7)) << 18; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 25); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 25); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 18)) << 7; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 25); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 21; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 25); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 25); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 15)) << 10; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 25); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, 
lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 1)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 25); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 25); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 13; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 25); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 25); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 23)) << 2; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 25); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 9)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 25); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 25); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 20)) << 5; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 25); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 19; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 25); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 25); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 17)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 25); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 3)) << 22; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 25); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 25); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 14)) << 11; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 25); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_26bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_26bw_lane(const uint64_t 
*__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 26); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 26); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 14)) << 12; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 26); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 2)) << 24; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 26); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 26); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 16)) << 10; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 26); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 22; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 26); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 26); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 18)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 26); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 20; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 26); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 26); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 20)) << 6; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 26); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 18; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 26); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 26); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 22)) << 4; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 26); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 
10]; tmp |= (src & MASK(uint64_t, 10)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 26); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 26); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 24)) << 2; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 26); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 14; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 26); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 0)) << 26; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 26); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 26); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 14)) << 12; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 26); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 2)) << 24; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 26); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 26); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 16)) << 10; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 26); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 22; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 26); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 26); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 18)) << 8; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 26); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 6)) << 20; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 26); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 26); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 20]; tmp |= (src 
& MASK(uint64_t, 20)) << 6; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 26); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 8)) << 18; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 26); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 26); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 22)) << 4; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 26); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 10)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 26); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 26); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 2; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 26); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 12)) << 14; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 26); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_27bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_27bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 27); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 27); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 17)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 27); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 7)) << 20; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 27); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 27); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 24)) << 3; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 27); - out[INDEX(8, lane)] = tmp; + 
out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 13; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 27); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 23; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 27); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 27); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 21)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 27); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 11)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 27); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 1)) << 26; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 27); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 27); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 18)) << 9; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 27); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 19; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 27); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 27); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 25)) << 2; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 27); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 15)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 27); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 5)) << 22; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 27); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 27); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 14]; tmp |= (src & 
MASK(uint64_t, 22)) << 5; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 27); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 12)) << 15; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 27); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 25; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 27); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 27); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 19)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 27); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 9)) << 18; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 27); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 27); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 26)) << 1; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 27); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 11; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 27); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 21; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 27); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 27); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 23)) << 4; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 27); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 13)) << 14; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 27); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 3)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 27); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 30) & 
MASK(uint64_t, 27); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 7; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 27); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 10)) << 17; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 27); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_28bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_28bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 28); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 28); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 20)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 28); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 12)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 28); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 4)) << 24; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 28); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 28); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 24)) << 4; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 28); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 12; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 28); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 8)) << 20; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 28); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 28; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 28); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 28) & 
MASK(uint64_t, 28); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 20)) << 8; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 28); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 28); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 4)) << 24; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 28); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 28); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 24)) << 4; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 28); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 16)) << 12; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 28); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 20; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 28); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 0)) << 28; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 28); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 28); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 20)) << 8; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 28); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 12)) << 16; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 28); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 24; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 28); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 28); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 24)) << 4; - out[INDEX(41, lane)] = tmp; + 
out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 28); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 16)) << 12; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 28); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 8)) << 20; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 28); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 28; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 28); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 28); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 20)) << 8; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 28); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 12)) << 16; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 28); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 4)) << 24; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 28); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 28); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 24)) << 4; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 28); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 12; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 28); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 8)) << 20; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 28); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_29bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_29bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 29); - 
out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 29); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 23)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 29); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 17)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 29); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 11)) << 18; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 29); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 5)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 29); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 29); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 28)) << 1; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 29); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 22)) << 7; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 29); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 16)) << 13; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 29); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 10)) << 19; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 29); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 4)) << 25; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 29); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 29); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 27)) << 2; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 29); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 21)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 
21) & MASK(uint64_t, 29); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 15)) << 14; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 29); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 9)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 29); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 3)) << 26; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 29); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 29); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 26)) << 3; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 29); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 20)) << 9; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 29); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 14)) << 15; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 29); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 8)) << 21; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 29); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 2)) << 27; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 29); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 29); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 25)) << 4; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 29); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 19)) << 10; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 29); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 13)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 29); - out[INDEX(49, lane)] = tmp; 
+ out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 7)) << 22; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 29); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 1)) << 28; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 29); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 29); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 24)) << 5; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 29); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 18)) << 11; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 29); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 12)) << 17; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 29); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 6)) << 23; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 29); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_30bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_30bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 30); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 30); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 26)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 30); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 22)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 30); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 18)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 30); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + 
LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 30); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 20; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 30); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 24; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 30); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 28; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 30); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 30); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 2; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 30); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 6; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 30); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 10; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 30); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 14; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 30); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 18; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 30); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 22; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 30); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 26; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 30); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 30; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 30); - 
out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 30); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 26)) << 4; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 30); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 22)) << 8; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 30); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 18)) << 12; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 30); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 14)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 30); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 10)) << 20; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 30); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 24; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 30); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 2)) << 28; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 30); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 30); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 28)) << 2; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 30); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 6; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 30); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 10; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 30); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 14; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 30); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = 
tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 12)) << 18; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 30); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 22; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 30); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 4)) << 26; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 30); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_31bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_31bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 31); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 31); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 29)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 31); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 27)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 31); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 25)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 31); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 23)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 31); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 21)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 31); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 19)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 31); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 17)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 17) & 
MASK(uint64_t, 31); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 15)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 31); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 13)) << 18; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 31); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 11)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 31); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 9)) << 22; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 31); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 7)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 31); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 5)) << 26; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 31); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 3)) << 28; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 31); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 1)) << 30; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 31); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 31); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 30)) << 1; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 31); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 28)) << 3; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 31); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 26)) << 5; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 31); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 19]; tmp |= 
(src & MASK(uint64_t, 24)) << 7; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 31); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 22)) << 9; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 31); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 20)) << 11; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 31); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 18)) << 13; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 31); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 16)) << 15; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 31); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 14)) << 17; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 31); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 12)) << 19; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 31); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 10)) << 21; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 31); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 8)) << 23; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 31); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 6)) << 25; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 31); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 4)) << 27; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 31); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 2)) << 29; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 31); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); - out[INDEX(63, 
lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_32bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_32bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 
0)) << 32; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & 
MASK(uint64_t, 0)) << 32; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_33bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_33bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 33); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 31; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 33); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 29; - 
out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 33); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 27; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 33); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 25; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 33); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 23; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 33); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 21; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 33); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 14)) << 19; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 33); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 17; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 33); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 18)) << 15; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 33); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 13; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 33); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 22)) << 11; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 33); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 24)) << 9; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 33); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 26)) << 7; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 33); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 28)) << 5; 
- out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 33); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 30)) << 3; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 33); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 32)) << 1; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 1)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 33); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 3)) << 30; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 33); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 5)) << 28; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 33); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 7)) << 26; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 33); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 9)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 33); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 11)) << 22; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 33); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 13)) << 20; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 33); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 15)) << 18; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 33); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 17)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 33); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 19)) << 14; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 19) & 
MASK(uint64_t, 33); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 21)) << 12; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 33); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 23)) << 10; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 33); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 25)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 33); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 27)) << 6; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 33); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 29)) << 4; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 33); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 31)) << 2; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_34bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_34bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 34); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 4)) << 30; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 34); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 8)) << 26; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 34); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 12)) << 22; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 34); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 16)) << 18; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 34); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane 
+ LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 20)) << 14; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 34); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 24)) << 10; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 34); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 28)) << 6; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 34); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 32)) << 2; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 2)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 34); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 6)) << 28; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 34); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 24; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 34); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 14)) << 20; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 34); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 18)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 34); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 22)) << 12; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 34); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 26)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 34); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 30)) << 4; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 0)) << 34; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 34); - out[INDEX(32, lane)] = tmp; + 
out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 4)) << 30; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 34); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 8)) << 26; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 34); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 12)) << 22; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 34); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 16)) << 18; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 34); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 20)) << 14; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 34); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 24)) << 10; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 34); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 28)) << 6; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 34); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 32)) << 2; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 2)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 34); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 6)) << 28; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 34); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 10)) << 24; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 34); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 14)) << 20; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 34); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = 
in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 18)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 34); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 22)) << 12; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 34); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 26)) << 8; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 34); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 30)) << 4; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_35bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_35bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 35); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 6)) << 29; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 35); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 12)) << 23; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 35); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 18)) << 17; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 35); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 24)) << 11; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 35); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 30)) << 5; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 1)) << 34; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 35); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 7)) << 28; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 35); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = 
(src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 13)) << 22; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 35); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 19)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 35); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 25)) << 10; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 35); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 31)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 2)) << 33; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 35); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 27; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 35); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 14)) << 21; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 35); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 20)) << 15; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 35); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 26)) << 9; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 35); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 32)) << 3; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 3)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 35); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 9)) << 26; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 35); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 15)) << 20; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 15) & 
MASK(uint64_t, 35); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 21)) << 14; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 35); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 27)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 35); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 33)) << 2; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 4)) << 31; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 35); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 10)) << 25; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 35); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 19; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 35); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 22)) << 13; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 35); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 28)) << 7; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 35); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 1; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 5)) << 30; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 35); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 11)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 35); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 17)) << 18; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 35); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 23)) << 
12; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 35); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 29)) << 6; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_36bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_36bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 36); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 28; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 36); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 16)) << 20; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 36); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 24)) << 12; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 36); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 32)) << 4; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 32; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 36); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 24; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 36); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 20)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 36); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 8; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 36; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 36); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 28; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; 
tmp = (src >> 8) & MASK(uint64_t, 36); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 20; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 36); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 24)) << 12; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 36); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 32)) << 4; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 32; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 36); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 12)) << 24; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 36); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 20)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 36); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 28)) << 8; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 0)) << 36; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 36); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 8)) << 28; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 36); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 20; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 36); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 24)) << 12; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 36); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 4; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 4)) << 32; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 4) 
& MASK(uint64_t, 36); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 12)) << 24; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 36); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 16; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 36); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 28)) << 8; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 0)) << 36; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 36); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 28; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 36); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 16)) << 20; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 36); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 24)) << 12; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 36); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 4; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 4)) << 32; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 36); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 12)) << 24; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 36); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 20)) << 16; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 36); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 28)) << 8; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_37bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, 
unsigned int lane) { +__device__ void _bit_unpack_64_37bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 37); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 10)) << 27; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 37); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 20)) << 17; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 37); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 30)) << 7; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 3)) << 34; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 37); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 13)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 37); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 23)) << 14; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 37); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 33)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 6)) << 31; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 37); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 16)) << 21; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 37); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 26)) << 11; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 37); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 36)) << 1; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 9)) << 28; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 37); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + 
reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 19)) << 18; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 37); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 29)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 2)) << 35; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 37); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 12)) << 25; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 37); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 22)) << 15; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 37); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 32)) << 5; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 5)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 37); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 15)) << 22; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 37); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 25)) << 12; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 37); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 35)) << 2; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 8)) << 29; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 37); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 18)) << 19; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 37); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 28)) << 9; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 1)) << 36; - 
out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 37); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 11)) << 26; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 37); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 21)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 37); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 31)) << 6; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 4)) << 33; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 37); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 14)) << 23; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 37); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 24)) << 13; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 37); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 34)) << 3; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 7)) << 30; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 37); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 17)) << 20; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 37); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 27)) << 10; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_38bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_38bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 38); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 12)) << 26; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] 
= tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 38); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 24)) << 14; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 38); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 36)) << 2; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 10)) << 28; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 38); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 22)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 38); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 34)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 30; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 38); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 20)) << 18; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 38); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 32)) << 6; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 6)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 38); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 18)) << 20; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 38); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 30)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 4)) << 34; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 38); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 22; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 38); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 54) & 
MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 28)) << 10; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 36; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 38); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 14)) << 24; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 38); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 26)) << 12; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 0)) << 38; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 38); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 12)) << 26; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 38); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 24)) << 14; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 38); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 36)) << 2; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 10)) << 28; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 38); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 22)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 38); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 34)) << 4; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 8)) << 30; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 38); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 20)) << 18; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 38); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 32)) << 6; - out[INDEX(47, lane)] = tmp; + 
out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 6)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 38); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 18)) << 20; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 38); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 30)) << 8; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 4)) << 34; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 38); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 16)) << 22; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 38); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 28)) << 10; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 2)) << 36; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 38); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 14)) << 24; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 38); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 26)) << 12; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_39bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_39bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 39); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 14)) << 25; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 39); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 28)) << 11; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 3)) << 36; - 
out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 39); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 17)) << 22; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 39); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 31)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 33; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 39); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 20)) << 19; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 39); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 34)) << 5; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 9)) << 30; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 39); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 23)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 39); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 37)) << 2; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 27; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 39); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 26)) << 13; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 1)) << 38; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 39); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 15)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 39); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 29)) << 10; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 17]; 
tmp |= (src & MASK(uint64_t, 4)) << 35; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 39); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 18)) << 21; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 39); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 32)) << 7; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 7)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 39); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 21)) << 18; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 39); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 35)) << 4; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 10)) << 29; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 39); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 15; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 39); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 38)) << 1; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 13)) << 26; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 39); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 27)) << 12; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 2)) << 37; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 39); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 16)) << 23; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 39); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 30)) << 9; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 
30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 5)) << 34; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 39); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 19)) << 20; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 39); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 33)) << 6; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 8)) << 31; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 39); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 22)) << 17; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 39); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 36)) << 3; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 11)) << 28; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 39); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 25)) << 14; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_40bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_40bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = 
(src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(29, lane)] = tmp; + 
out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 
32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_41bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_41bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 41); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 18)) << 23; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 41); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 36)) << 5; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 13)) << 28; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 41); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 31)) << 10; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 8)) << 33; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 41); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src 
>> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 26)) << 15; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 3)) << 38; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 41); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 21)) << 20; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 41); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 39)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 16)) << 25; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 41); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 34)) << 7; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 11)) << 30; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 41); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 29)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 6)) << 35; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 41); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 24)) << 17; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 1)) << 40; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 41); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 19)) << 22; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 41); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 37)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 14)) << 27; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 41); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 
20]; tmp |= (src & MASK(uint64_t, 32)) << 9; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 9)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 41); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 27)) << 14; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 4)) << 37; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 41); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 22)) << 19; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 41); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 40)) << 1; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 17)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 41); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 35)) << 6; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 12)) << 29; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 41); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 30)) << 11; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 7)) << 34; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 41); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 25)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 2)) << 39; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 41); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 20)) << 21; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 41); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 38)) << 3; - 
out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 15)) << 26; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 41); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 33)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 10)) << 31; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 41); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 28)) << 13; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 5)) << 36; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 41); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 23)) << 18; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_42bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_42bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 42); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 20)) << 22; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 42); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 40)) << 2; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 18)) << 24; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 42); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 38)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 26; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 42); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 36)) << 6; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); 
src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 14)) << 28; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 42); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 34)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 30; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 42); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 32)) << 10; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 42); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 30)) << 12; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 34; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 42); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 28)) << 14; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 6)) << 36; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 42); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 26)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 38; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 42); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 24)) << 18; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 2)) << 40; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 42); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 22)) << 20; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 42; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & 
MASK(uint64_t, 42); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 20)) << 22; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 42); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 40)) << 2; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 18)) << 24; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 42); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 38)) << 4; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 26; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 42); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 36)) << 6; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 14)) << 28; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 42); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 8; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 12)) << 30; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 42); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 10; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 10)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 42); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 30)) << 12; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 8)) << 34; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 42); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 28)) << 14; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 
36]; tmp |= (src & MASK(uint64_t, 6)) << 36; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 42); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 26)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 4)) << 38; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 42); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 18; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 2)) << 40; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 42); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 22)) << 20; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_43bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_43bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 43); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 22)) << 21; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 1)) << 42; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 43); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 23)) << 20; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 2)) << 41; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 43); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 24)) << 19; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 3)) << 40; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 43); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 25)) << 18; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + 
reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 4)) << 39; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 43); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 26)) << 17; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 5)) << 38; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 43); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 27)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 6)) << 37; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 43); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 28)) << 15; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 7)) << 36; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 43); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 29)) << 14; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 8)) << 35; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 43); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 30)) << 13; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 9)) << 34; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 43); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 31)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 10)) << 33; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 43); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 32)) << 11; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 11)) << 32; - out[INDEX(32, lane)] = tmp; + 
out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 43); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 33)) << 10; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 12)) << 31; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 43); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 34)) << 9; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 13)) << 30; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 43); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 35)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 14)) << 29; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 43); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 36)) << 7; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 15)) << 28; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 43); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 37)) << 6; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 16)) << 27; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 43); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 38)) << 5; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 17)) << 26; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 43); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 39)) << 4; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 18)) << 25; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 43); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src 
>> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 40)) << 3; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 19)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 43); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 41)) << 2; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 20)) << 23; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 43); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 42)) << 1; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 21)) << 22; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_44bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_44bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 44); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 24)) << 20; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 40; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 44); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 28)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 36; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 44); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 32)) << 12; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 32; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 44); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 36)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = 
in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 28; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 44); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 40)) << 4; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 24; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 0)) << 44; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 44); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 24)) << 20; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 4)) << 40; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 44); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 28)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 8)) << 36; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 44); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 32)) << 12; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 32; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 44); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 36)) << 8; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 16)) << 28; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 44); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 40)) << 4; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 20)) << 24; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 0)) << 44; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 44); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 44) & 
MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 24)) << 20; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 4)) << 40; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 44); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 28)) << 16; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 8)) << 36; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 44); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 32)) << 12; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 12)) << 32; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 44); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 36)) << 8; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 16)) << 28; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 44); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 40)) << 4; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 20)) << 24; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 0)) << 44; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 44); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 24)) << 20; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 4)) << 40; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 44); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 28)) << 16; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 8)) << 36; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 44); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; 
tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 32)) << 12; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 12)) << 32; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 44); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 36)) << 8; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 16)) << 28; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 44); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 40)) << 4; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 20)) << 24; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_45bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_45bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 45); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 26)) << 19; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 7)) << 38; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 45); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 33)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 31; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 45); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 40)) << 5; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 21)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 43; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 45); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 
17); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 17; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 9)) << 36; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 45); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 35)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 29; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 45); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 42)) << 3; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 23)) << 22; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 41; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 45); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 30)) << 15; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 11)) << 34; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 45); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 37)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 18)) << 27; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 45); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 44)) << 1; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 25)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 39; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 45); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 13; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 13)) << 32; - 
out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 45); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 39)) << 6; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 25; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 1)) << 44; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 45); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 27)) << 18; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 37; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 45); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 11; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 15)) << 30; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 45); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 41)) << 4; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 22)) << 23; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 3)) << 42; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 45); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 29)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 10)) << 35; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 45); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 36)) << 9; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 17)) << 28; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 45); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 38]; tmp |= (src & 
MASK(uint64_t, 43)) << 2; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 21; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 5)) << 40; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 45); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 31)) << 14; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 12)) << 33; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 45); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 38)) << 7; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 19)) << 26; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_46bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_46bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 46); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 28)) << 18; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 10)) << 36; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 46); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 38)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 20)) << 26; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 2)) << 44; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 46); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 30)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 12)) << 34; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 46); - 
out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 40)) << 6; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 22)) << 24; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 4)) << 42; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 46); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 32)) << 14; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 14)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 46); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 42)) << 4; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 24)) << 22; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 6)) << 40; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 46); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 34)) << 12; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 16)) << 30; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 46); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 44)) << 2; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 26)) << 20; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 8)) << 38; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 46); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 36)) << 10; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 18)) << 28; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 0)) << 46; - out[INDEX(31, 
lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 46); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 28)) << 18; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 10)) << 36; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 46); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 38)) << 8; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 20)) << 26; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 2)) << 44; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 46); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 30)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 12)) << 34; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 46); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 40)) << 6; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 22)) << 24; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 4)) << 42; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 46); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 32)) << 14; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 14)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 46); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 42)) << 4; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 24)) << 22; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 6)) << 40; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 
46); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 34)) << 12; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 16)) << 30; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 46); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 44)) << 2; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 26)) << 20; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 8)) << 38; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 46); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 36)) << 10; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 18)) << 28; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_47bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_47bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 47); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 30)) << 17; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 13)) << 34; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 47); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 43)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 26)) << 21; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 9)) << 38; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 47); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 39)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT 
* 7]; tmp |= (src & MASK(uint64_t, 22)) << 25; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 5)) << 42; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 47); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 35)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 18)) << 29; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 1)) << 46; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 47); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 31)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 14)) << 33; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 47); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 44)) << 3; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 27)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 10)) << 37; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 47); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 40)) << 7; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 23)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 6)) << 41; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 47); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 36)) << 11; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 19)) << 28; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 2)) << 45; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 47); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, 
lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 32)) << 15; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 15)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 47); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 45)) << 2; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 28)) << 19; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 11)) << 36; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 47); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 41)) << 6; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 24)) << 23; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 7)) << 40; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 47); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 37)) << 10; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 20)) << 27; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 3)) << 44; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 47); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 33)) << 14; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 16)) << 31; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 47); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 46)) << 1; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 29)) << 18; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 12)) << 35; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp 
+ reference; tmp = (src >> 12) & MASK(uint64_t, 47); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 42)) << 5; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 25)) << 22; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 8)) << 39; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 47); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 38)) << 9; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 21)) << 26; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 4)) << 43; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 47); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 34)) << 13; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 17)) << 30; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_48bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_48bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 48; - 
out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 48) & 
MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 32) & 
MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_49bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_49bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 49); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 34)) << 15; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 19)) << 30; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 4)) << 45; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 49); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 38)) << 11; - out[INDEX(5, lane)] = tmp; + 
out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 23)) << 26; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 8)) << 41; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 49); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 42)) << 7; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 27)) << 22; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 37; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 49); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 46)) << 3; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 31)) << 18; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 16)) << 33; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 1)) << 48; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 49); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 35)) << 14; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 20)) << 29; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 5)) << 44; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 49); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 39)) << 10; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 24)) << 25; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 9)) << 40; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 49); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 43)) << 6; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + 
reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 28)) << 21; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 13)) << 36; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 49); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 47)) << 2; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 32)) << 17; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 17)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 2)) << 47; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 49); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 36)) << 13; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 21)) << 28; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 6)) << 43; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 49); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 40)) << 9; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 25)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 10)) << 39; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 49); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 44)) << 5; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 29)) << 20; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 14)) << 35; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 49); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 48)) << 1; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp 
= (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 33)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 18)) << 31; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 3)) << 46; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 49); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 37)) << 12; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 22)) << 27; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 7)) << 42; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 49); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 41)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 26)) << 23; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 11)) << 38; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 49); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 45)) << 4; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 30)) << 19; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 15)) << 34; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_50bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_50bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 50); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 36)) << 14; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 22)) << 28; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + 
LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 42; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 50); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 44)) << 6; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 30)) << 20; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 16)) << 34; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 48; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 50); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 38)) << 12; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 26; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 10)) << 40; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 50); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 46)) << 4; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 32)) << 18; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 18)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 46; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 50); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 40)) << 10; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 26)) << 24; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 38; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 50); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 48)) << 2; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & 
MASK(uint64_t, 34)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 20)) << 30; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 44; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 50); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 42)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 28)) << 22; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 14)) << 36; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 0)) << 50; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 50); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 36)) << 14; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 22)) << 28; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 42; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 50); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 44)) << 6; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 30)) << 20; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 16)) << 34; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 2)) << 48; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 50); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 38)) << 12; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 24)) << 26; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 10)) << 40; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 10) & 
MASK(uint64_t, 50); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 46)) << 4; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 32)) << 18; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 18)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 4)) << 46; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 50); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 40)) << 10; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 26)) << 24; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 12)) << 38; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 50); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 48)) << 2; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 34)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 20)) << 30; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 6)) << 44; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 50); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 42)) << 8; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 28)) << 22; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 14)) << 36; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_51bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_51bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 51); - out[INDEX(0, lane)] = 
tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 38)) << 13; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 25)) << 26; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 12)) << 39; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 51); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 50)) << 1; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 37)) << 14; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 24)) << 27; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 11)) << 40; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 51); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 49)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 36)) << 15; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 23)) << 28; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 41; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 51); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 48)) << 3; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 35)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 22)) << 29; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 9)) << 42; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 51); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 47)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 
34)) << 17; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 21)) << 30; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 8)) << 43; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 51); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 46)) << 5; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 33)) << 18; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 20)) << 31; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 7)) << 44; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 51); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 45)) << 6; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 32)) << 19; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 19)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 6)) << 45; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 51); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 44)) << 7; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 31)) << 20; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 18)) << 33; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 5)) << 46; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 51); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 43)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 30)) << 21; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = 
in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 17)) << 34; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 4)) << 47; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 51); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 42)) << 9; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 29)) << 22; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 16)) << 35; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 3)) << 48; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 51); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 41)) << 10; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 28)) << 23; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 15)) << 36; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 2)) << 49; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 51); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 40)) << 11; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 27)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 14)) << 37; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 1)) << 50; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 51); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 39)) << 12; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 26)) << 25; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 13)) << 38; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = 
tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_52bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_52bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 52); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 40)) << 12; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 28)) << 24; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 16)) << 36; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 48; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 52); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 44)) << 8; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 32)) << 20; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 20)) << 32; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 44; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 52); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 48)) << 4; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 36)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 24)) << 28; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 40; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 0)) << 52; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 52); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 40)) << 12; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + 
LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 28)) << 24; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 16)) << 36; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 48; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 52); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 44)) << 8; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 32)) << 20; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 20)) << 32; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 8)) << 44; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 52); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 48)) << 4; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 36)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 28; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 12)) << 40; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 0)) << 52; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 52); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 40)) << 12; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 28)) << 24; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 16)) << 36; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 4)) << 48; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 52); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 44)) << 8; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + 
reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 32)) << 20; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 20)) << 32; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 8)) << 44; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 52); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 48)) << 4; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 36)) << 16; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 24)) << 28; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 12)) << 40; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 0)) << 52; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 52); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 40)) << 12; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 28)) << 24; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 16)) << 36; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 4)) << 48; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 52); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 44)) << 8; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 32)) << 20; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 20)) << 32; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 8)) << 44; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 52); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 48)) << 4; - 
out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 36)) << 16; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 24)) << 28; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 12)) << 40; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_53bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_53bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 53); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 42)) << 11; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 31)) << 22; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 20)) << 33; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 9)) << 44; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 53); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 51)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 40)) << 13; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 29)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 18)) << 35; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 7)) << 46; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 53); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 49)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 38)) << 15; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 27)) << 26; 
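// A standalone sketch (not part of the generated FastLanes kernels in this file) of the
// pattern these hunks apply: each `_bit_unpack_64_<N>bw_lane` helper gains a
// `const uint64_t reference` parameter and adds it to every unpacked value, presumably so
// the frame-of-reference (FoR) offset is applied in the same pass as bit-unpacking rather
// than in a separate kernel. The loop form below, the `bit_unpack_with_reference` name,
// and the local MASK64 helper are illustrative assumptions; they use a simple sequential
// bit layout, not the unrolled, lane-interleaved FastLanes layout of the generated code.
#include <cstdint>

// Low `bits` bits set; the ternary guards the undefined 64-bit shift.
#define MASK64(bits) ((bits) == 64 ? ~uint64_t(0) : ((uint64_t(1) << (bits)) - 1))

__device__ void bit_unpack_with_reference(const uint64_t *__restrict in,
                                          uint64_t *__restrict out,
                                          const uint64_t reference,
                                          unsigned int bit_width,
                                          unsigned int count) {
    for (unsigned int i = 0; i < count; ++i) {
        uint64_t bit_pos = uint64_t(i) * bit_width;  // bit offset of value i
        uint64_t word = bit_pos / 64;
        unsigned int shift = unsigned(bit_pos % 64);

        uint64_t value = (in[word] >> shift) & MASK64(bit_width);
        if (shift + bit_width > 64) {
            // Value straddles a word boundary: pull the high bits from the next word.
            value |= (in[word + 1] << (64 - shift)) & MASK64(bit_width);
        }
        // Fused FoR: add the reference as the value is written out, matching the
        // `out[...] = tmp + reference;` rewrite applied throughout these hunks.
        out[i] = value + reference;
    }
}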
- out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 16)) << 37; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 5)) << 48; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 53); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 47)) << 6; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 36)) << 17; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 25)) << 28; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 14)) << 39; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 3)) << 50; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 53); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 45)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 34)) << 19; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 23)) << 30; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 12)) << 41; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 1)) << 52; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 53); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 43)) << 10; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 32)) << 21; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 21)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 10)) << 43; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 53); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane 
+ LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 52)) << 1; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 41)) << 12; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 30)) << 23; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 19)) << 34; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 8)) << 45; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 53); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 50)) << 3; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 39)) << 14; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 28)) << 25; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 17)) << 36; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 6)) << 47; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 53); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 48)) << 5; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 37)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 26)) << 27; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 15)) << 38; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 4)) << 49; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 53); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 46)) << 7; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 35)) << 18; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 
24)) << 29; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 13)) << 40; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 2)) << 51; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 53); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 44)) << 9; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 33)) << 20; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 22)) << 31; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 11)) << 42; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_54bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_54bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 54); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 44)) << 10; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 34)) << 20; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 24)) << 30; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 40; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 50; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 54); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 48)) << 6; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 38)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 26; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 9]; tmp |= (src & 
MASK(uint64_t, 18)) << 36; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 46; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 54); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 52)) << 2; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 42)) << 12; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 32)) << 22; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 22)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 12)) << 42; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 52; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 54); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 46)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 36)) << 18; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 26)) << 28; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 38; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 48; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 54); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 50)) << 4; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 40)) << 14; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 30)) << 24; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 34; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 10)) << 44; - out[INDEX(30, lane)] = 
tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 0)) << 54; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 54); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 44)) << 10; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 20; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 24)) << 30; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 14)) << 40; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 4)) << 50; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 54); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 48)) << 6; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 38)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 28)) << 26; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 18)) << 36; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 8)) << 46; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 54); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 52)) << 2; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 42)) << 12; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 32)) << 22; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 22)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 12)) << 42; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 2)) << 52; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp 
= (src >> 2) & MASK(uint64_t, 54); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 46)) << 8; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 36)) << 18; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 26)) << 28; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 16)) << 38; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 6)) << 48; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 54); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 50)) << 4; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 40)) << 14; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 30)) << 24; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 20)) << 34; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 10)) << 44; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_55bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_55bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 55); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 46)) << 9; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 37)) << 18; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 28)) << 27; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 19)) << 36; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 45; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + 
reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 1)) << 54; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 55); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 47)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 38)) << 17; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 29)) << 26; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 35; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 11)) << 44; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 2)) << 53; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 55); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 48)) << 7; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 39)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 30)) << 25; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 21)) << 34; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 43; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 3)) << 52; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 55); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 49)) << 6; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 40)) << 15; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 31)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 22)) << 33; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = 
in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 13)) << 42; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 4)) << 51; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 55); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 50)) << 5; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 41)) << 14; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 32)) << 23; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 23)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 14)) << 41; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 5)) << 50; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 55); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 51)) << 4; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 42)) << 13; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 33)) << 22; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 24)) << 31; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 15)) << 40; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 6)) << 49; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 55); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 52)) << 3; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 43)) << 12; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 34)) << 21; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 40]; tmp |= (src & 
MASK(uint64_t, 25)) << 30; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 16)) << 39; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 7)) << 48; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 55); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 53)) << 2; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 44)) << 11; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 35)) << 20; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 26)) << 29; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 17)) << 38; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 8)) << 47; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 55); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 54)) << 1; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 45)) << 10; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 36)) << 19; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 27)) << 28; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 18)) << 37; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 9)) << 46; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_56bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_56bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + 
LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + 
LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 48)) << 
8; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 54]; tmp |= (src & 
MASK(uint64_t, 16)) << 40; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_57bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_57bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 57); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 50)) << 7; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 43)) << 14; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 36)) << 21; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 29)) << 28; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 22)) << 35; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 15)) << 42; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 49; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 1)) << 56; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 57); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 51)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 44)) << 13; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 37)) << 20; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 30)) << 27; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 23)) << 34; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 41; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src 
>> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 9)) << 48; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 55; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 57); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 52)) << 5; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 45)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 38)) << 19; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 31)) << 26; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 24)) << 33; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 17)) << 40; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 10)) << 47; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 3)) << 54; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 57); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 53)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 46)) << 11; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 39)) << 18; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 32)) << 25; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 25)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 18)) << 39; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 11)) << 46; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 4)) << 53; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp 
+ reference; tmp = (src >> 4) & MASK(uint64_t, 57); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 54)) << 3; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 47)) << 10; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 40)) << 17; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 33)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 26)) << 31; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 19)) << 38; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 12)) << 45; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 5)) << 52; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 57); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 55)) << 2; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 48)) << 9; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 41)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 34)) << 23; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 27)) << 30; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 20)) << 37; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 13)) << 44; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 6)) << 51; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 57); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 56)) << 1; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); 
src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 49)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 42)) << 15; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 35)) << 22; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 28)) << 29; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 21)) << 36; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 14)) << 43; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 7)) << 50; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 57); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_58bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_58bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 58); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 52)) << 6; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 46)) << 12; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 40)) << 18; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 34)) << 24; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 28)) << 30; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 22)) << 36; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 16)) << 42; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 10)) << 48; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 4)) << 54; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 58); - out[INDEX(10, lane)] = tmp; + 
out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 56)) << 2; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 50)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 44)) << 14; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 38)) << 20; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 32)) << 26; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 26)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 20)) << 38; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 14)) << 44; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 8)) << 50; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 2)) << 56; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 58); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 54)) << 4; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 48)) << 10; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 42)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 36)) << 22; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 30)) << 28; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 24)) << 34; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 18)) << 40; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 12)) << 46; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = 
in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 6)) << 52; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 0)) << 58; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 58); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 52)) << 6; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 46)) << 12; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 40)) << 18; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 34)) << 24; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 28)) << 30; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 22)) << 36; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 16)) << 42; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 10)) << 48; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 4)) << 54; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 58); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 56)) << 2; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 50)) << 8; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 44)) << 14; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 38)) << 20; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 32)) << 26; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 26)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 20)) << 38; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 20) & 
MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 14)) << 44; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 8)) << 50; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 2)) << 56; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 58); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 54)) << 4; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 48)) << 10; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 42)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 36)) << 22; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 30)) << 28; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 24)) << 34; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 18)) << 40; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 12)) << 46; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 6)) << 52; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_59bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_59bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 59); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 54)) << 5; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 49)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 44)) << 15; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 39)) << 20; 
- out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 34)) << 25; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 29)) << 30; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 24)) << 35; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 19)) << 40; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 14)) << 45; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 9)) << 50; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 4)) << 55; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 59); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 58)) << 1; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 53)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 48)) << 11; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 43)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 38)) << 21; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 33)) << 26; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 28)) << 31; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 23)) << 36; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 18)) << 41; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 13)) << 46; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 8)) << 51; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & 
MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 3)) << 56; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 59); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 57)) << 2; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 52)) << 7; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 47)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 42)) << 17; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 37)) << 22; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 32)) << 27; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 27)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 22)) << 37; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 17)) << 42; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 12)) << 47; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 7)) << 52; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 57); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 2)) << 57; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 59); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 56)) << 3; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 51)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 46)) << 13; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 41)) << 18; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 36)) << 23; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + 
reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 31)) << 28; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 26)) << 33; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 21)) << 38; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 16)) << 43; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 11)) << 48; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 6)) << 53; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 1)) << 58; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 59); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 55)) << 4; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 50)) << 9; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 45)) << 14; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 40)) << 19; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 35)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 30)) << 29; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 25)) << 34; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 20)) << 39; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 15)) << 44; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 10)) << 49; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 5)) << 54; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 59); - out[INDEX(63, lane)] = tmp; + 
out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_60bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_60bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 60); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 56)) << 4; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 52)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 48)) << 12; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 44)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 40)) << 20; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 36)) << 24; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 32)) << 28; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 32; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 36; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 40; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 44; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 48; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 52; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 56; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 60; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 60); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 56)) << 4; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + 
reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 52)) << 8; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 48)) << 12; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 44)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 40)) << 20; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 36)) << 24; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 28; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 28)) << 32; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 36; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 40; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 44; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 12)) << 48; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 52; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 4)) << 56; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 0)) << 60; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 60); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 56)) << 4; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 52)) << 8; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 48)) << 12; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 44)) << 16; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 35]; tmp 
|= (src & MASK(uint64_t, 40)) << 20; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 36)) << 24; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 32)) << 28; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 28)) << 32; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 36; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 20)) << 40; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 16)) << 44; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 12)) << 48; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 8)) << 52; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 4)) << 56; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 0)) << 60; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 60); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 56)) << 4; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 52)) << 8; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 48)) << 12; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 44)) << 16; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 40)) << 20; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 36)) << 24; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 32)) << 28; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 28)) << 32; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, 
lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 24)) << 36; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 20)) << 40; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 16)) << 44; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 12)) << 48; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 8)) << 52; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 59]; tmp |= (src & MASK(uint64_t, 4)) << 56; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_61bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_61bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 61); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 58)) << 3; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 55)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 52)) << 9; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 49)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 46)) << 15; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 43)) << 18; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 40)) << 21; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 37)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 34)) << 27; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 31)) << 30; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src 
>> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 28)) << 33; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 25)) << 36; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 22)) << 39; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 19)) << 42; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 16)) << 45; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 13)) << 48; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 10)) << 51; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 7)) << 54; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 57); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 4)) << 57; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 1)) << 60; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 61); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 59)) << 2; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 56)) << 5; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 53)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 50)) << 11; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 47)) << 14; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 44)) << 17; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 41)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 38)) << 23; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 29]; tmp |= (src & 
MASK(uint64_t, 35)) << 26; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 32)) << 29; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 29)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 26)) << 35; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 23)) << 38; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 20)) << 41; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 17)) << 44; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 14)) << 47; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 11)) << 50; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 8)) << 53; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 5)) << 56; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 59); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 2)) << 59; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 61); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 60)) << 1; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 57)) << 4; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 54)) << 7; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 51)) << 10; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 48)) << 13; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 45)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 42)) << 19; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + 
reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 39)) << 22; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 36)) << 25; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 33)) << 28; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 30)) << 31; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 27)) << 34; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 24)) << 37; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 21)) << 40; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 18)) << 43; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 15)) << 46; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 12)) << 49; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 9)) << 52; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); src = in[lane + LANE_COUNT * 59]; tmp |= (src & MASK(uint64_t, 6)) << 55; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 60]; tmp |= (src & MASK(uint64_t, 3)) << 58; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 61); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_62bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_62bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 62); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 60)) << 2; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 58)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 56)) << 6; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = 
(src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 54)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 52)) << 10; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 50)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 48)) << 14; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 46)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 44)) << 18; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 42)) << 20; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 40)) << 22; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 38)) << 24; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 36)) << 26; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 34)) << 28; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 32)) << 30; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 30)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 28)) << 34; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 26)) << 36; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 24)) << 38; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 22)) << 40; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 20)) << 42; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 18)) << 44; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 18) & 
MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 16)) << 46; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 14)) << 48; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 12)) << 50; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 10)) << 52; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 8)) << 54; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 6)) << 56; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 4)) << 58; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 2)) << 60; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 62); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 0)) << 62; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 62); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 60)) << 2; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 58)) << 4; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 56)) << 6; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 54)) << 8; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 52)) << 10; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 50)) << 12; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 48)) << 14; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 46)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 44)) << 18; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 42)) << 20; 
- out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 40)) << 22; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 38)) << 24; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 36)) << 26; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 34)) << 28; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 32)) << 30; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 30)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 28)) << 34; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 26)) << 36; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 24)) << 38; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 22)) << 40; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 20)) << 42; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 18)) << 44; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 16)) << 46; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 14)) << 48; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 12)) << 50; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 10)) << 52; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 8)) << 54; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 59]; tmp |= (src & MASK(uint64_t, 6)) << 56; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 60]; tmp |= (src & MASK(uint64_t, 4)) << 58; - 
out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 61]; tmp |= (src & MASK(uint64_t, 2)) << 60; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 62); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_63bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_63bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 63); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 62)) << 1; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 61)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 60)) << 3; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 59)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 58)) << 5; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 57)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 56)) << 7; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 55)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 54)) << 9; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 53)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 52)) << 11; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 51)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 50)) << 13; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 49)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 48)) << 15; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, 
lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 47)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 46)) << 17; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 45)) << 18; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 44)) << 19; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 43)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 42)) << 21; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 41)) << 22; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 40)) << 23; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 39)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 38)) << 25; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 37)) << 26; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 36)) << 27; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 35)) << 28; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 29; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 33)) << 30; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 31; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 31)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 30)) << 33; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 29)) << 34; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, 
lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 28)) << 35; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 27)) << 36; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 26)) << 37; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 25)) << 38; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 39; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 23)) << 40; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 22)) << 41; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 21)) << 42; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 20)) << 43; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 19)) << 44; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 18)) << 45; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 17)) << 46; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 16)) << 47; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 15)) << 48; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 14)) << 49; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 13)) << 50; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 12)) << 51; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 11)) << 52; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 10)) << 53; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, 
lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 9)) << 54; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 8)) << 55; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 7)) << 56; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 57); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 6)) << 57; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 5)) << 58; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 59); src = in[lane + LANE_COUNT * 59]; tmp |= (src & MASK(uint64_t, 4)) << 59; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 60]; tmp |= (src & MASK(uint64_t, 3)) << 60; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 61); src = in[lane + LANE_COUNT * 61]; tmp |= (src & MASK(uint64_t, 2)) << 61; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 62); src = in[lane + LANE_COUNT * 62]; tmp |= (src & MASK(uint64_t, 1)) << 62; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 63); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_64bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_64bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; - out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane]; - out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane]; - out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane]; - out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane]; - out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane]; - out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane]; - out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane]; - out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane]; - out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane]; - out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane]; - out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane]; - out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane]; - out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane]; - out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane]; - out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane]; - out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane]; - out[INDEX(16, lane)] = in[LANE_COUNT * 16 + lane]; - out[INDEX(17, lane)] = in[LANE_COUNT * 17 + lane]; - out[INDEX(18, lane)] = in[LANE_COUNT * 18 + lane]; - out[INDEX(19, lane)] = in[LANE_COUNT * 19 + lane]; - out[INDEX(20, lane)] = in[LANE_COUNT * 20 + lane]; - out[INDEX(21, lane)] = in[LANE_COUNT * 21 + lane]; - out[INDEX(22, lane)] = in[LANE_COUNT * 22 + lane]; - out[INDEX(23, lane)] = in[LANE_COUNT * 23 + lane]; - out[INDEX(24, lane)] = in[LANE_COUNT * 24 + lane]; - out[INDEX(25, lane)] = in[LANE_COUNT * 25 + lane]; - out[INDEX(26, lane)] = in[LANE_COUNT * 26 + lane]; - out[INDEX(27, lane)] = in[LANE_COUNT * 
27 + lane]; - out[INDEX(28, lane)] = in[LANE_COUNT * 28 + lane]; - out[INDEX(29, lane)] = in[LANE_COUNT * 29 + lane]; - out[INDEX(30, lane)] = in[LANE_COUNT * 30 + lane]; - out[INDEX(31, lane)] = in[LANE_COUNT * 31 + lane]; - out[INDEX(32, lane)] = in[LANE_COUNT * 32 + lane]; - out[INDEX(33, lane)] = in[LANE_COUNT * 33 + lane]; - out[INDEX(34, lane)] = in[LANE_COUNT * 34 + lane]; - out[INDEX(35, lane)] = in[LANE_COUNT * 35 + lane]; - out[INDEX(36, lane)] = in[LANE_COUNT * 36 + lane]; - out[INDEX(37, lane)] = in[LANE_COUNT * 37 + lane]; - out[INDEX(38, lane)] = in[LANE_COUNT * 38 + lane]; - out[INDEX(39, lane)] = in[LANE_COUNT * 39 + lane]; - out[INDEX(40, lane)] = in[LANE_COUNT * 40 + lane]; - out[INDEX(41, lane)] = in[LANE_COUNT * 41 + lane]; - out[INDEX(42, lane)] = in[LANE_COUNT * 42 + lane]; - out[INDEX(43, lane)] = in[LANE_COUNT * 43 + lane]; - out[INDEX(44, lane)] = in[LANE_COUNT * 44 + lane]; - out[INDEX(45, lane)] = in[LANE_COUNT * 45 + lane]; - out[INDEX(46, lane)] = in[LANE_COUNT * 46 + lane]; - out[INDEX(47, lane)] = in[LANE_COUNT * 47 + lane]; - out[INDEX(48, lane)] = in[LANE_COUNT * 48 + lane]; - out[INDEX(49, lane)] = in[LANE_COUNT * 49 + lane]; - out[INDEX(50, lane)] = in[LANE_COUNT * 50 + lane]; - out[INDEX(51, lane)] = in[LANE_COUNT * 51 + lane]; - out[INDEX(52, lane)] = in[LANE_COUNT * 52 + lane]; - out[INDEX(53, lane)] = in[LANE_COUNT * 53 + lane]; - out[INDEX(54, lane)] = in[LANE_COUNT * 54 + lane]; - out[INDEX(55, lane)] = in[LANE_COUNT * 55 + lane]; - out[INDEX(56, lane)] = in[LANE_COUNT * 56 + lane]; - out[INDEX(57, lane)] = in[LANE_COUNT * 57 + lane]; - out[INDEX(58, lane)] = in[LANE_COUNT * 58 + lane]; - out[INDEX(59, lane)] = in[LANE_COUNT * 59 + lane]; - out[INDEX(60, lane)] = in[LANE_COUNT * 60 + lane]; - out[INDEX(61, lane)] = in[LANE_COUNT * 61 + lane]; - out[INDEX(62, lane)] = in[LANE_COUNT * 62 + lane]; - out[INDEX(63, lane)] = in[LANE_COUNT * 63 + lane]; + out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane] + reference; + out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane] + reference; + out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane] + reference; + out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane] + reference; + out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane] + reference; + out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane] + reference; + out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane] + reference; + out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane] + reference; + out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane] + reference; + out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane] + reference; + out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane] + reference; + out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane] + reference; + out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane] + reference; + out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane] + reference; + out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane] + reference; + out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane] + reference; + out[INDEX(16, lane)] = in[LANE_COUNT * 16 + lane] + reference; + out[INDEX(17, lane)] = in[LANE_COUNT * 17 + lane] + reference; + out[INDEX(18, lane)] = in[LANE_COUNT * 18 + lane] + reference; + out[INDEX(19, lane)] = in[LANE_COUNT * 19 + lane] + reference; + out[INDEX(20, lane)] = in[LANE_COUNT * 20 + lane] + reference; + out[INDEX(21, lane)] = in[LANE_COUNT * 21 + lane] + reference; + out[INDEX(22, lane)] = in[LANE_COUNT * 22 + lane] + reference; + out[INDEX(23, lane)] = in[LANE_COUNT * 23 + lane] + reference; + out[INDEX(24, lane)] = in[LANE_COUNT * 24 + lane] + reference; + out[INDEX(25, lane)] = 
in[LANE_COUNT * 25 + lane] + reference; + out[INDEX(26, lane)] = in[LANE_COUNT * 26 + lane] + reference; + out[INDEX(27, lane)] = in[LANE_COUNT * 27 + lane] + reference; + out[INDEX(28, lane)] = in[LANE_COUNT * 28 + lane] + reference; + out[INDEX(29, lane)] = in[LANE_COUNT * 29 + lane] + reference; + out[INDEX(30, lane)] = in[LANE_COUNT * 30 + lane] + reference; + out[INDEX(31, lane)] = in[LANE_COUNT * 31 + lane] + reference; + out[INDEX(32, lane)] = in[LANE_COUNT * 32 + lane] + reference; + out[INDEX(33, lane)] = in[LANE_COUNT * 33 + lane] + reference; + out[INDEX(34, lane)] = in[LANE_COUNT * 34 + lane] + reference; + out[INDEX(35, lane)] = in[LANE_COUNT * 35 + lane] + reference; + out[INDEX(36, lane)] = in[LANE_COUNT * 36 + lane] + reference; + out[INDEX(37, lane)] = in[LANE_COUNT * 37 + lane] + reference; + out[INDEX(38, lane)] = in[LANE_COUNT * 38 + lane] + reference; + out[INDEX(39, lane)] = in[LANE_COUNT * 39 + lane] + reference; + out[INDEX(40, lane)] = in[LANE_COUNT * 40 + lane] + reference; + out[INDEX(41, lane)] = in[LANE_COUNT * 41 + lane] + reference; + out[INDEX(42, lane)] = in[LANE_COUNT * 42 + lane] + reference; + out[INDEX(43, lane)] = in[LANE_COUNT * 43 + lane] + reference; + out[INDEX(44, lane)] = in[LANE_COUNT * 44 + lane] + reference; + out[INDEX(45, lane)] = in[LANE_COUNT * 45 + lane] + reference; + out[INDEX(46, lane)] = in[LANE_COUNT * 46 + lane] + reference; + out[INDEX(47, lane)] = in[LANE_COUNT * 47 + lane] + reference; + out[INDEX(48, lane)] = in[LANE_COUNT * 48 + lane] + reference; + out[INDEX(49, lane)] = in[LANE_COUNT * 49 + lane] + reference; + out[INDEX(50, lane)] = in[LANE_COUNT * 50 + lane] + reference; + out[INDEX(51, lane)] = in[LANE_COUNT * 51 + lane] + reference; + out[INDEX(52, lane)] = in[LANE_COUNT * 52 + lane] + reference; + out[INDEX(53, lane)] = in[LANE_COUNT * 53 + lane] + reference; + out[INDEX(54, lane)] = in[LANE_COUNT * 54 + lane] + reference; + out[INDEX(55, lane)] = in[LANE_COUNT * 55 + lane] + reference; + out[INDEX(56, lane)] = in[LANE_COUNT * 56 + lane] + reference; + out[INDEX(57, lane)] = in[LANE_COUNT * 57 + lane] + reference; + out[INDEX(58, lane)] = in[LANE_COUNT * 58 + lane] + reference; + out[INDEX(59, lane)] = in[LANE_COUNT * 59 + lane] + reference; + out[INDEX(60, lane)] = in[LANE_COUNT * 60 + lane] + reference; + out[INDEX(61, lane)] = in[LANE_COUNT * 61 + lane] + reference; + out[INDEX(62, lane)] = in[LANE_COUNT * 62 + lane] + reference; + out[INDEX(63, lane)] = in[LANE_COUNT * 63 + lane] + reference; } /// Runtime dispatch to the optimized lane decoder for the given bit width. 
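
Reviewer note on the generated hunks above: every fixed-width lane decoder now takes the frame-of-reference value as an extra `const uint64_t reference` argument and folds the addition into each store, so a FoR-over-BitPacked array decodes in a single pass instead of unpacking first and adding the reference in a second kernel. The sketch below is illustrative only and is not the generated FastLanes code: it uses a plain element-per-thread layout rather than the 16-thread lane-transposed layout, and the kernel name, the `BIT_WIDTH` template parameter, and the padding assumption are inventions for the example (the generated code handles bit widths 0 and 64 with dedicated kernels).

// Illustrative sketch of the fused "bit-unpack + add reference" idea used by
// the generated kernels above, with a simple element-per-thread layout.
// Assumes 0 < BIT_WIDTH < 64 and that `packed` is padded so the straddling
// read below never runs past the end of the allocation.
// Launch as e.g. unpack_add_reference<13><<<blocks, threads>>>(packed, out, reference, n);
#include <cstdint>

template <unsigned BIT_WIDTH>
__global__ void unpack_add_reference(const uint64_t *__restrict packed,
                                     uint64_t *__restrict out,
                                     const uint64_t reference,
                                     size_t n) {
    size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (i >= n) return;

    size_t bit = i * BIT_WIDTH;              // bit offset of element i
    size_t word = bit / 64;
    unsigned shift = (unsigned)(bit % 64);
    uint64_t mask = (1ULL << BIT_WIDTH) - 1;

    uint64_t value = (packed[word] >> shift) & mask;
    if (shift + BIT_WIDTH > 64) {
        // Field straddles a 64-bit word boundary: pull the high bits in.
        value |= (packed[word + 1] << (64 - shift)) & mask;
    }

    // Fused FoR decode: the reference is added at the point of the store,
    // exactly as the generated lane decoders do with `tmp + reference`.
    out[i] = value + reference;
}

Because every `extern "C"` entry point above gained a trailing `const uint64_t reference` parameter, the host-side launch in vortex-cuda must now pass one additional scalar argument per kernel; the fused add costs nothing extra on the device since the value is already in a register when it is stored.
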
@@ -12693,1043 +12692,1043 @@ __device__ inline void bit_unpack_64_lane( } } -__device__ void _bit_unpack_64_0bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_0bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_0bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_0bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_0bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_0bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_0bw_16t(in, out, thread_idx); + _bit_unpack_64_0bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_1bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_1bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_1bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_1bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_1bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_1bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_1bw_16t(in, out, thread_idx); + _bit_unpack_64_1bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_2bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_2bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_2bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_2bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_2bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_2bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_2bw_16t(in, out, thread_idx); + _bit_unpack_64_2bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_3bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_3bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_3bw_lane(in, shared_out, thread_idx * 1 + 0); + 
_bit_unpack_64_3bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_3bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_3bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_3bw_16t(in, out, thread_idx); + _bit_unpack_64_3bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_4bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_4bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_4bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_4bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_4bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_4bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_4bw_16t(in, out, thread_idx); + _bit_unpack_64_4bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_5bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_5bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_5bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_5bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_5bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_5bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_5bw_16t(in, out, thread_idx); + _bit_unpack_64_5bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_6bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_6bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_6bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_6bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_6bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_6bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = 
threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_6bw_16t(in, out, thread_idx); + _bit_unpack_64_6bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_7bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_7bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_7bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_7bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_7bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_7bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_7bw_16t(in, out, thread_idx); + _bit_unpack_64_7bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_8bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_8bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_8bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_8bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_8bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_8bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_8bw_16t(in, out, thread_idx); + _bit_unpack_64_8bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_9bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_9bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_9bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_9bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_9bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_9bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_9bw_16t(in, out, thread_idx); + _bit_unpack_64_9bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_10bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_10bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const 
uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_10bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_10bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_10bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_10bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_10bw_16t(in, out, thread_idx); + _bit_unpack_64_10bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_11bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_11bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_11bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_11bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_11bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_11bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_11bw_16t(in, out, thread_idx); + _bit_unpack_64_11bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_12bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_12bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_12bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_12bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_12bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_12bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_12bw_16t(in, out, thread_idx); + _bit_unpack_64_12bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_13bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_13bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_13bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_13bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_13bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict 
full_out) { +extern "C" __global__ void bit_unpack_64_13bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_13bw_16t(in, out, thread_idx); + _bit_unpack_64_13bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_14bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_14bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_14bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_14bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_14bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_14bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_14bw_16t(in, out, thread_idx); + _bit_unpack_64_14bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_15bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_15bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_15bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_15bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_15bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_15bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_15bw_16t(in, out, thread_idx); + _bit_unpack_64_15bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_16bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_16bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_16bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_16bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_16bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_16bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_16bw_16t(in, out, thread_idx); + _bit_unpack_64_16bw_16t(in, out, reference, thread_idx); } -__device__ void 
_bit_unpack_64_17bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_17bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_17bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_17bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_17bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_17bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 17 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_17bw_16t(in, out, thread_idx); + _bit_unpack_64_17bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_18bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_18bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_18bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_18bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_18bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_18bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 18 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_18bw_16t(in, out, thread_idx); + _bit_unpack_64_18bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_19bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_19bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_19bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_19bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_19bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_19bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 19 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_19bw_16t(in, out, thread_idx); + _bit_unpack_64_19bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_20bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_20bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_20bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_20bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for 
(int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_20bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_20bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 20 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_20bw_16t(in, out, thread_idx); + _bit_unpack_64_20bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_21bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_21bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_21bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_21bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_21bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_21bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 21 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_21bw_16t(in, out, thread_idx); + _bit_unpack_64_21bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_22bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_22bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_22bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_22bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_22bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_22bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 22 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_22bw_16t(in, out, thread_idx); + _bit_unpack_64_22bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_23bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_23bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_23bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_23bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_23bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_23bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * 
(128 * 23 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_23bw_16t(in, out, thread_idx); + _bit_unpack_64_23bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_24bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_24bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_24bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_24bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_24bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_24bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 24 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_24bw_16t(in, out, thread_idx); + _bit_unpack_64_24bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_25bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_25bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_25bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_25bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_25bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_25bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 25 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_25bw_16t(in, out, thread_idx); + _bit_unpack_64_25bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_26bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_26bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_26bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_26bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_26bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_26bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 26 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_26bw_16t(in, out, thread_idx); + _bit_unpack_64_26bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_27bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_27bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t 
reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_27bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_27bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_27bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_27bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 27 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_27bw_16t(in, out, thread_idx); + _bit_unpack_64_27bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_28bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_28bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_28bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_28bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_28bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_28bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 28 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_28bw_16t(in, out, thread_idx); + _bit_unpack_64_28bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_29bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_29bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_29bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_29bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_29bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_29bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 29 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_29bw_16t(in, out, thread_idx); + _bit_unpack_64_29bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_30bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_30bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_30bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_30bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_30bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { 
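Throughout this generated family the change is uniform: every lane decoder gains a per-chunk reference operand and adds it to each unpacked value, fusing frame-of-reference decoding into the bit-unpacking pass instead of applying it in a separate pass over the output. As a minimal sketch of the per-element effect (the helper below is illustrative only and is not part of the generated kernels):

// Illustrative helper, not generated code: one decoded element under fused
// frame-of-reference + bit-unpacking. `shift` selects the element's bits inside
// the packed word; `reference` is the per-chunk bias (typically the chunk minimum)
// that was subtracted at encode time.
__device__ inline uint64_t ffor_unpack_one(uint64_t packed_word, unsigned int shift,
                                           unsigned int bit_width, uint64_t reference) {
    const uint64_t mask = (bit_width == 64) ? ~0ULL : ((1ULL << bit_width) - 1ULL);
    return ((packed_word >> shift) & mask) + reference;
}

With a bit width of zero the mask collapses to zero and the result is the reference alone, which is exactly what the zero-width kernels in this family now emit.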
+extern "C" __global__ void bit_unpack_64_30bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 30 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_30bw_16t(in, out, thread_idx); + _bit_unpack_64_30bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_31bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_31bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_31bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_31bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_31bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_31bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 31 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_31bw_16t(in, out, thread_idx); + _bit_unpack_64_31bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_32bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_32bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_32bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_32bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_32bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_32bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 32 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_32bw_16t(in, out, thread_idx); + _bit_unpack_64_32bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_33bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_33bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_33bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_33bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_33bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_33bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 33 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_33bw_16t(in, out, thread_idx); + _bit_unpack_64_33bw_16t(in, out, reference, thread_idx); } -__device__ void 
_bit_unpack_64_34bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_34bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_34bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_34bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_34bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_34bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 34 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_34bw_16t(in, out, thread_idx); + _bit_unpack_64_34bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_35bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_35bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_35bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_35bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_35bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_35bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 35 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_35bw_16t(in, out, thread_idx); + _bit_unpack_64_35bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_36bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_36bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_36bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_36bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_36bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_36bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 36 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_36bw_16t(in, out, thread_idx); + _bit_unpack_64_36bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_37bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_37bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_37bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_37bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for 
(int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_37bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_37bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 37 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_37bw_16t(in, out, thread_idx); + _bit_unpack_64_37bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_38bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_38bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_38bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_38bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_38bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_38bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 38 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_38bw_16t(in, out, thread_idx); + _bit_unpack_64_38bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_39bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_39bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_39bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_39bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_39bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_39bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 39 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_39bw_16t(in, out, thread_idx); + _bit_unpack_64_39bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_40bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_40bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_40bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_40bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_40bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_40bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * 
(128 * 40 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_40bw_16t(in, out, thread_idx); + _bit_unpack_64_40bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_41bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_41bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_41bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_41bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_41bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_41bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 41 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_41bw_16t(in, out, thread_idx); + _bit_unpack_64_41bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_42bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_42bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_42bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_42bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_42bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_42bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 42 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_42bw_16t(in, out, thread_idx); + _bit_unpack_64_42bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_43bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_43bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_43bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_43bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_43bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_43bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 43 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_43bw_16t(in, out, thread_idx); + _bit_unpack_64_43bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_44bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_44bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t 
reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_44bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_44bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_44bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_44bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 44 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_44bw_16t(in, out, thread_idx); + _bit_unpack_64_44bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_45bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_45bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_45bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_45bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_45bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_45bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 45 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_45bw_16t(in, out, thread_idx); + _bit_unpack_64_45bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_46bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_46bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_46bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_46bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_46bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_46bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 46 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_46bw_16t(in, out, thread_idx); + _bit_unpack_64_46bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_47bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_47bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_47bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_47bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_47bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { 
+extern "C" __global__ void bit_unpack_64_47bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 47 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_47bw_16t(in, out, thread_idx); + _bit_unpack_64_47bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_48bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_48bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_48bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_48bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_48bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_48bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 48 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_48bw_16t(in, out, thread_idx); + _bit_unpack_64_48bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_49bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_49bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_49bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_49bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_49bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_49bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 49 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_49bw_16t(in, out, thread_idx); + _bit_unpack_64_49bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_50bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_50bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_50bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_50bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_50bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_50bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 50 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_50bw_16t(in, out, thread_idx); + _bit_unpack_64_50bw_16t(in, out, reference, thread_idx); } -__device__ void 
_bit_unpack_64_51bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_51bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_51bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_51bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_51bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_51bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 51 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_51bw_16t(in, out, thread_idx); + _bit_unpack_64_51bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_52bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_52bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_52bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_52bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_52bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_52bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 52 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_52bw_16t(in, out, thread_idx); + _bit_unpack_64_52bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_53bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_53bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_53bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_53bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_53bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_53bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 53 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_53bw_16t(in, out, thread_idx); + _bit_unpack_64_53bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_54bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_54bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_54bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_54bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for 
(int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_54bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_54bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 54 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_54bw_16t(in, out, thread_idx); + _bit_unpack_64_54bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_55bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_55bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_55bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_55bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_55bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_55bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 55 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_55bw_16t(in, out, thread_idx); + _bit_unpack_64_55bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_56bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_56bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_56bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_56bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_56bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_56bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 56 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_56bw_16t(in, out, thread_idx); + _bit_unpack_64_56bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_57bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_57bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_57bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_57bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_57bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_57bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * 
(128 * 57 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_57bw_16t(in, out, thread_idx); + _bit_unpack_64_57bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_58bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_58bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_58bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_58bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_58bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_58bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 58 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_58bw_16t(in, out, thread_idx); + _bit_unpack_64_58bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_59bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_59bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_59bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_59bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_59bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_59bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 59 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_59bw_16t(in, out, thread_idx); + _bit_unpack_64_59bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_60bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_60bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_60bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_60bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_60bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_60bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 60 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_60bw_16t(in, out, thread_idx); + _bit_unpack_64_60bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_61bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_61bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t 
reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_61bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_61bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_61bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_61bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 61 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_61bw_16t(in, out, thread_idx); + _bit_unpack_64_61bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_62bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_62bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_62bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_62bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_62bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_62bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 62 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_62bw_16t(in, out, thread_idx); + _bit_unpack_64_62bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_63bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_63bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_63bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_63bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_63bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_63bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 63 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_63bw_16t(in, out, thread_idx); + _bit_unpack_64_63bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_64bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_64_64bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, int thread_idx) { __shared__ uint64_t shared_out[1024]; - _bit_unpack_64_64bw_lane(in, shared_out, thread_idx * 1 + 0); + _bit_unpack_64_64bw_lane(in, shared_out, reference, thread_idx * 1 + 0); for (int i = 0; i < 64; i++) { auto idx = i * 16 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_64_64bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { 
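Every wrapper in the 64-bit family above shares one launch geometry: a block decodes a single FastLanes chunk of 1024 values, the packed input advances by 128 * bit_width bytes per block, the output advances by 1024 elements, and the _16t suffix suggests 16 threads per block, one lane each. To make that geometry concrete, here is a hedged host-side sketch in plain CUDA C++; the helper name, the stream handling, and the 16-thread block size are assumptions for illustration, not how the crate itself drives these kernels.

// Launch-geometry sketch only (compile with nvcc alongside the generated kernels).
// d_in holds num_chunks packed chunks of 128 * 13 bytes each; d_out receives
// num_chunks * 1024 decoded uint64_t values; `reference` is the per-chunk FoR bias.
void launch_bit_unpack_64_13bw(const uint64_t *d_in, uint64_t *d_out,
                               uint64_t reference, size_t num_chunks,
                               cudaStream_t stream) {
    dim3 grid(static_cast<unsigned int>(num_chunks)); // one 1024-value chunk per block
    dim3 block(16);                                    // 16 threads, one lane per thread
    bit_unpack_64_13bw_16t<<<grid, block, 0, stream>>>(d_in, d_out, reference);
}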
+extern "C" __global__ void bit_unpack_64_64bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, const uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 64 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_64bw_16t(in, out, thread_idx); + _bit_unpack_64_64bw_16t(in, out, reference, thread_idx); } diff --git a/vortex-cuda/kernels/src/bit_unpack_8.cu b/vortex-cuda/kernels/src/bit_unpack_8.cu index 311d9784018..879247ac941 100644 --- a/vortex-cuda/kernels/src/bit_unpack_8.cu +++ b/vortex-cuda/kernels/src/bit_unpack_8.cu @@ -4,241 +4,240 @@ #include #include "fastlanes_common.cuh" -__device__ void _bit_unpack_8_0bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_0bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; - uint8_t zero = 0ULL; - out[INDEX(0, lane)] = zero; - out[INDEX(1, lane)] = zero; - out[INDEX(2, lane)] = zero; - out[INDEX(3, lane)] = zero; - out[INDEX(4, lane)] = zero; - out[INDEX(5, lane)] = zero; - out[INDEX(6, lane)] = zero; - out[INDEX(7, lane)] = zero; + out[INDEX(0, lane)] = reference; + out[INDEX(1, lane)] = reference; + out[INDEX(2, lane)] = reference; + out[INDEX(3, lane)] = reference; + out[INDEX(4, lane)] = reference; + out[INDEX(5, lane)] = reference; + out[INDEX(6, lane)] = reference; + out[INDEX(7, lane)] = reference; } -__device__ void _bit_unpack_8_1bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_1bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 1); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint8_t, 1); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 1); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint8_t, 1); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 1); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint8_t, 1); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 1); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint8_t, 1); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_2bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_2bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 2); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 2); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 2); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 0)) << 2; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & 
MASK(uint8_t, 2); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 2); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 2); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_3bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_3bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 3); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint8_t, 3); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 1)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint8_t, 3); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 3); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint8_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 2)) << 1; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 3); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint8_t, 3); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_4bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_4bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 4); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 0)) << 4; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 4); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 0)) << 4; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 4); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint8_t, 0)) << 4; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 4); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_5bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_5bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 5); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, 
lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint8_t, 3); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 2)) << 3; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 5); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint8_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 4)) << 1; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint8_t, 1)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint8_t, 5); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint8_t, 3)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint8_t, 5); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_6bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_6bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 6); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 4)) << 2; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 2)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint8_t, 0)) << 6; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 6); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint8_t, 4)) << 2; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint8_t, 2)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 6); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_7bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_7bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 7); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint8_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 6)) << 1; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 5)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint8_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint8_t, 4)) << 3; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, 
lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint8_t, 3)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint8_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint8_t, 2)) << 5; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint8_t, 1)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint8_t, 7); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_8bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_8bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; - out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane]; - out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane]; - out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane]; - out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane]; - out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane]; - out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane]; - out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane]; - out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane]; + out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane] + reference; + out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane] + reference; + out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane] + reference; + out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane] + reference; + out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane] + reference; + out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane] + reference; + out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane] + reference; + out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane] + reference; } /// Runtime dispatch to the optimized lane decoder for the given bit width. 
@@ -261,174 +260,174 @@ __device__ inline void bit_unpack_8_lane( } } -__device__ void _bit_unpack_8_0bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_0bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_0bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_0bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_0bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_0bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_0bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_0bw_lane(in, shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_0bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_0bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_0bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_0bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_0bw_32t(in, out, thread_idx); + _bit_unpack_8_0bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_1bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_1bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_1bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_1bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_1bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_1bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_1bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_1bw_lane(in, shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_1bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_1bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_1bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_1bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_1bw_32t(in, out, thread_idx); + _bit_unpack_8_1bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_2bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_2bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_2bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_2bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_2bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_2bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_2bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_2bw_lane(in, 
shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_2bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_2bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_2bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_2bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_2bw_32t(in, out, thread_idx); + _bit_unpack_8_2bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_3bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_3bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_3bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_3bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_3bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_3bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_3bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_3bw_lane(in, shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_3bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_3bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_3bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_3bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_3bw_32t(in, out, thread_idx); + _bit_unpack_8_3bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_4bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_4bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_4bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_4bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_4bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_4bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_4bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_4bw_lane(in, shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_4bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_4bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_4bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_4bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_4bw_32t(in, out, 
thread_idx); + _bit_unpack_8_4bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_5bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_5bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_5bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_5bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_5bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_5bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_5bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_5bw_lane(in, shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_5bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_5bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_5bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_5bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_5bw_32t(in, out, thread_idx); + _bit_unpack_8_5bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_6bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_6bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_6bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_6bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_6bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_6bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_6bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_6bw_lane(in, shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_6bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_6bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_6bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_6bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_6bw_32t(in, out, thread_idx); + _bit_unpack_8_6bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_7bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_7bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_7bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_7bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_7bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_7bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_7bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_7bw_lane(in, 
shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_7bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_7bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_7bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_7bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_7bw_32t(in, out, thread_idx); + _bit_unpack_8_7bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_8bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { +__device__ void _bit_unpack_8_8bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, int thread_idx) { __shared__ uint8_t shared_out[1024]; - _bit_unpack_8_8bw_lane(in, shared_out, thread_idx * 4 + 0); - _bit_unpack_8_8bw_lane(in, shared_out, thread_idx * 4 + 1); - _bit_unpack_8_8bw_lane(in, shared_out, thread_idx * 4 + 2); - _bit_unpack_8_8bw_lane(in, shared_out, thread_idx * 4 + 3); + _bit_unpack_8_8bw_lane(in, shared_out, reference, thread_idx * 4 + 0); + _bit_unpack_8_8bw_lane(in, shared_out, reference, thread_idx * 4 + 1); + _bit_unpack_8_8bw_lane(in, shared_out, reference, thread_idx * 4 + 2); + _bit_unpack_8_8bw_lane(in, shared_out, reference, thread_idx * 4 + 3); for (int i = 0; i < 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } } -extern "C" __global__ void bit_unpack_8_8bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_8bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, const uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_8bw_32t(in, out, thread_idx); + _bit_unpack_8_8bw_32t(in, out, reference, thread_idx); } diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index a4455ab536c..fcc23b08c92 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -58,7 +58,7 @@ impl CudaExecute for BitPackedExecutor { Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected BitPackedArray"))?; match_each_integer_ptype!(array.ptype(), |A| { - decode_bitpacked::(array, ctx).await + decode_bitpacked::(array, 0, ctx).await }) } } @@ -91,6 +91,7 @@ pub fn bitpacked_cuda_launch_config(output_width: usize, len: usize) -> VortexRe pub(crate) async fn decode_bitpacked( array: BitPackedArray, + reference: A, ctx: &mut CudaExecutionCtx, ) -> VortexResult where @@ -128,7 +129,7 @@ where let config = bitpacked_cuda_launch_config(output_width, len)?; ctx.launch_kernel_config(&cuda_function, config, len, |args| { - args.arg(&input_view).arg(&output_view); + args.arg(&input_view).arg(&output_view).arg(&reference); })?; let output_handle = match patches { diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 6ec3bc5772b..03930eca25d 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -9,22 +9,23 @@ use cudarc::driver::PushKernelArg; use 
vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; -use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveArrayParts; +use vortex_array::arrays::{PrimitiveArray, SliceVTable}; use vortex_array::buffer::BufferHandle; use vortex_cuda_macros::cuda_tests; -use vortex_dtype::NativePType; use vortex_dtype::match_each_native_simd_ptype; +use vortex_dtype::{NativePType, match_each_integer_ptype, match_each_native_ptype}; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_err; -use vortex_fastlanes::FoRArray; use vortex_fastlanes::FoRVTable; +use vortex_fastlanes::{BitPackedVTable, FoRArray}; use crate::CudaBufferExt; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; +use crate::kernel::encodings::bitpacked::decode_bitpacked; /// CUDA decoder for frame-of-reference. #[derive(Debug)] @@ -49,6 +50,29 @@ impl CudaExecute for FoRExecutor { ) -> VortexResult { let array = Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected FoRArray"))?; + // Fuse: FOR/BP + if let Some(bitpacked) = array.encoded().as_opt::() { + match_each_integer_ptype!(array.ptype(), |P| { + let reference: P = P::try_from(array.reference_scalar())?; + return decode_bitpacked(bitpacked.clone(), reference, ctx).await; + }); + } + + // Fuse FOR/BP/Slice + if let Some(sliced) = array.encoded().as_opt::() + && let Some(bitpacked) = sliced.child().as_opt::() + { + match_each_integer_ptype!(array.ptype(), |P| { + let reference: P = P::try_from(array.reference_scalar())?; + return decode_bitpacked(bitpacked.clone(), reference, ctx) + .await? + .into_primitive() + .slice(sliced.slice_range().clone())? + .to_canonical(); + }); + } + + // Fallback: execute child then apply frame-of-reference offset match_each_native_simd_ptype!(array.ptype(), |P| { decode_for::
<P>
(array, ctx).await }) } } From 2578ba386d24f14add7601ec7c527c23aeb2a5a8 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 13 Feb 2026 09:44:46 -0500 Subject: [PATCH 11/11] save Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/encodings/bitpacked.rs | 19 +++++++++++++++++-- vortex-cuda/src/kernel/encodings/for_.rs | 10 +++++++--- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index fcc23b08c92..c5a39c710e8 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -1,7 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::cmp::max; use std::fmt::Debug; +use std::ops::Range; use std::sync::Arc; use async_trait::async_trait; @@ -57,8 +59,9 @@ impl CudaExecute for BitPackedExecutor { let array = Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected BitPackedArray"))?; + let len = array.len(); match_each_integer_ptype!(array.ptype(), |A| { - decode_bitpacked::(array, 0, ctx).await + decode_bitpacked::(array, 0, (0..len), ctx).await }) } } @@ -92,6 +95,7 @@ pub fn bitpacked_cuda_launch_config(output_width: usize, len: usize) -> VortexRe pub(crate) async fn decode_bitpacked( array: BitPackedArray, reference: A, + range: Range, ctx: &mut CudaExecutionCtx, ) -> VortexResult where @@ -110,7 +114,18 @@ where vortex_ensure!(len > 0, "Non empty array"); let offset = offset as usize; - let device_input: BufferHandle = if packed.is_on_device() { + let offset_start = range.start + offset; + let offset_stop = range.end + offset; + let offset = offset_start % 1024; + let block_start = max(0, offset_start - offset); + let block_stop = offset_stop.div_ceil(1024) * 1024; + + let encoded_start = (block_start / 8) * bit_width as usize; + let encoded_stop = (block_stop / 8) * bit_width as usize; + + let sliced_packed = packed.slice(encoded_start..encoded_stop); + + let device_input: BufferHandle = if sliced_packed.is_on_device() { packed } else { ctx.move_to_device(packed)?.await? diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 03930eca25d..8b02f9f6438 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -9,17 +9,21 @@ use cudarc::driver::PushKernelArg; use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; +use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveArrayParts; -use vortex_array::arrays::{PrimitiveArray, SliceVTable}; +use vortex_array::arrays::SliceVTable; use vortex_array::buffer::BufferHandle; use vortex_cuda_macros::cuda_tests; +use vortex_dtype::NativePType; +use vortex_dtype::match_each_integer_ptype; +use vortex_dtype::match_each_native_ptype; use vortex_dtype::match_each_native_simd_ptype; -use vortex_dtype::{NativePType, match_each_integer_ptype, match_each_native_ptype}; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_err; +use vortex_fastlanes::BitPackedVTable; +use vortex_fastlanes::FoRArray; use vortex_fastlanes::FoRVTable; -use vortex_fastlanes::{BitPackedVTable, FoRArray}; use crate::CudaBufferExt; use crate::executor::CudaArrayExt;