diff --git a/vortex-cuda/kernels/src/.clang-format b/vortex-cuda/.clang-format similarity index 81% rename from vortex-cuda/kernels/src/.clang-format rename to vortex-cuda/.clang-format index d6b5a383e19..6a25679451b 100644 --- a/vortex-cuda/kernels/src/.clang-format +++ b/vortex-cuda/.clang-format @@ -14,9 +14,11 @@ DerivePointerAlignment: false PointerAlignment: Right AlignConsecutiveMacros: true AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +BinPackArguments: false +BinPackParameters: false AlignAfterOpenBracket: Align SpaceBeforeCpp11BracedList: true SpaceBeforeCtorInitializerColon: true @@ -32,4 +34,3 @@ IncludeBlocks: Regroup Language: Cpp AccessModifierOffset: -4 --- - diff --git a/vortex-cuda/cub/kernels/filter.cu b/vortex-cuda/cub/kernels/filter.cu index ecf11b32b9f..42f55b24540 100644 --- a/vortex-cuda/cub/kernels/filter.cu +++ b/vortex-cuda/cub/kernels/filter.cu @@ -4,10 +4,10 @@ // CUB DeviceSelect::Flagged wrapper for Vortex GPU filtering. #include -#include -#include #include #include +#include +#include // i256 type typedef struct { @@ -17,11 +17,10 @@ typedef struct { // Bit extraction functor for TransformInputIterator struct BitExtractor { - const uint8_t* packed; + const uint8_t *packed; uint64_t bit_offset; - __host__ __device__ inline - uint8_t operator()(int64_t idx) const { + __host__ __device__ inline uint8_t operator()(int64_t idx) const { uint64_t actual_bit = bit_offset + static_cast(idx); uint64_t byte_idx = actual_bit / 8; uint32_t bit_idx = actual_bit % 8; @@ -30,31 +29,27 @@ struct BitExtractor { }; /// Type alias for the packed bit iterator. 
-using PackedBitIterator = thrust::transform_iterator< - BitExtractor, - thrust::counting_iterator ->; +using PackedBitIterator = thrust::transform_iterator>; // CUB DeviceSelect::Flagged - Query temp storage size -template -static cudaError_t filter_temp_size_impl(size_t* temp_bytes, int64_t num_items) { +template +static cudaError_t filter_temp_size_impl(size_t *temp_bytes, int64_t num_items) { size_t bytes = 0; - cudaError_t err = cub::DeviceSelect::Flagged( - nullptr, bytes, - static_cast(nullptr), - static_cast(nullptr), - static_cast(nullptr), - static_cast(nullptr), - num_items - ); + cudaError_t err = cub::DeviceSelect::Flagged(nullptr, + bytes, + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + num_items); *temp_bytes = bytes; return err; } -#define DEFINE_TEMP_SIZE(suffix, Type) \ -extern "C" cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t n) { \ - return filter_temp_size_impl(temp_bytes, n); \ -} +#define DEFINE_TEMP_SIZE(suffix, Type) \ + extern "C" cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t n) { \ + return filter_temp_size_impl(temp_bytes, n); \ + } DEFINE_TEMP_SIZE(u8, uint8_t) DEFINE_TEMP_SIZE(i8, int8_t) @@ -70,33 +65,43 @@ DEFINE_TEMP_SIZE(i128, __int128_t) DEFINE_TEMP_SIZE(i256, __int256_t) // CUB DeviceSelect::Flagged - Execute filter with byte mask (one byte per element) -template -static cudaError_t filter_bytemask_impl( - void* d_temp, - size_t temp_bytes, - const T* d_in, - const uint8_t* d_flags, - T* d_out, - int64_t* d_num_selected, - int64_t num_items, - cudaStream_t stream -) { - return cub::DeviceSelect::Flagged( - d_temp, temp_bytes, - d_in, d_flags, d_out, d_num_selected, - num_items, stream - ); +template +static cudaError_t filter_bytemask_impl(void *d_temp, + size_t temp_bytes, + const T *d_in, + const uint8_t *d_flags, + T *d_out, + int64_t *d_num_selected, + int64_t num_items, + cudaStream_t stream) { + return cub::DeviceSelect::Flagged(d_temp, + temp_bytes, + d_in, + d_flags, + d_out, + d_num_selected, + num_items, + stream); } -#define DEFINE_FILTER_BYTEMASK(suffix, Type) \ -extern "C" cudaError_t filter_bytemask_##suffix( \ - void* d_temp, size_t temp_bytes, \ - const Type* d_in, const uint8_t* d_flags, \ - Type* d_out, int64_t* d_num_selected, \ - int64_t num_items, cudaStream_t stream \ -) { \ - return filter_bytemask_impl(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, num_items, stream); \ -} +#define DEFINE_FILTER_BYTEMASK(suffix, Type) \ + extern "C" cudaError_t filter_bytemask_##suffix(void *d_temp, \ + size_t temp_bytes, \ + const Type *d_in, \ + const uint8_t *d_flags, \ + Type *d_out, \ + int64_t *d_num_selected, \ + int64_t num_items, \ + cudaStream_t stream) { \ + return filter_bytemask_impl(d_temp, \ + temp_bytes, \ + d_in, \ + d_flags, \ + d_out, \ + d_num_selected, \ + num_items, \ + stream); \ + } DEFINE_FILTER_BYTEMASK(u8, uint8_t) DEFINE_FILTER_BYTEMASK(i8, int8_t) @@ -125,41 +130,51 @@ DEFINE_FILTER_BYTEMASK(i256, __int256_t) // d_num_selected: Output count of selected elements // num_items: Number of input elements // stream: CUDA stream -template -static cudaError_t filter_bitmask_impl( - void* d_temp, - size_t temp_bytes, - const T* d_in, - const uint8_t* d_bitmask, - uint64_t bit_offset, - T* d_out, - int64_t* d_num_selected, - int64_t num_items, - cudaStream_t stream -) { +template +static cudaError_t filter_bitmask_impl(void *d_temp, + size_t temp_bytes, + const T *d_in, + const uint8_t *d_bitmask, + uint64_t bit_offset, + T 
*d_out, + int64_t *d_num_selected, + int64_t num_items, + cudaStream_t stream) { // Create a transform iterator to read packed bits. - BitExtractor extractor{d_bitmask, bit_offset}; + BitExtractor extractor {d_bitmask, bit_offset}; thrust::counting_iterator counting_iter(0); PackedBitIterator flag_iter(counting_iter, extractor); - return cub::DeviceSelect::Flagged( - d_temp, temp_bytes, - d_in, flag_iter, d_out, d_num_selected, - num_items, stream - ); + return cub::DeviceSelect::Flagged(d_temp, + temp_bytes, + d_in, + flag_iter, + d_out, + d_num_selected, + num_items, + stream); } -#define DEFINE_FILTER_BITMASK(suffix, Type) \ -extern "C" cudaError_t filter_bitmask_##suffix( \ - void* d_temp, size_t temp_bytes, \ - const Type* d_in, \ - const uint8_t* d_bitmask, \ - uint64_t bit_offset, \ - Type* d_out, int64_t* d_num_selected, \ - int64_t num_items, cudaStream_t stream \ -) { \ - return filter_bitmask_impl(d_temp, temp_bytes, d_in, d_bitmask, bit_offset, d_out, d_num_selected, num_items, stream); \ -} +#define DEFINE_FILTER_BITMASK(suffix, Type) \ + extern "C" cudaError_t filter_bitmask_##suffix(void *d_temp, \ + size_t temp_bytes, \ + const Type *d_in, \ + const uint8_t *d_bitmask, \ + uint64_t bit_offset, \ + Type *d_out, \ + int64_t *d_num_selected, \ + int64_t num_items, \ + cudaStream_t stream) { \ + return filter_bitmask_impl(d_temp, \ + temp_bytes, \ + d_in, \ + d_bitmask, \ + bit_offset, \ + d_out, \ + d_num_selected, \ + num_items, \ + stream); \ + } DEFINE_FILTER_BITMASK(u8, uint8_t) DEFINE_FILTER_BITMASK(i8, int8_t) diff --git a/vortex-cuda/cub/kernels/filter.h b/vortex-cuda/cub/kernels/filter.h index 45458e02985..354b877dc17 100644 --- a/vortex-cuda/cub/kernels/filter.h +++ b/vortex-cuda/cub/kernels/filter.h @@ -17,47 +17,45 @@ typedef struct { // CUDA types - defined as opaque for bindgen typedef int cudaError_t; -typedef void* cudaStream_t; +typedef void *cudaStream_t; #ifdef __cplusplus extern "C" { #endif // X-macro table: (suffix, c_type) -#define FILTER_TYPE_TABLE(X) \ - X(u8, uint8_t) \ - X(i8, int8_t) \ - X(u16, uint16_t) \ - X(i16, int16_t) \ - X(u32, uint32_t) \ - X(i32, int32_t) \ - X(u64, uint64_t) \ - X(i64, int64_t) \ - X(f32, float) \ - X(f64, double) \ - X(i128, __int128_t) \ +#define FILTER_TYPE_TABLE(X) \ + X(u8, uint8_t) \ + X(i8, int8_t) \ + X(u16, uint16_t) \ + X(i16, int16_t) \ + X(u32, uint32_t) \ + X(i32, int32_t) \ + X(u64, uint64_t) \ + X(i64, int64_t) \ + X(f32, float) \ + X(f64, double) \ + X(i128, __int128_t) \ X(i256, __int256_t) // Filter temp size query functions -#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \ - cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t num_items); +#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \ + cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t num_items); FILTER_TYPE_TABLE(DECLARE_FILTER_TEMP_SIZE) #undef DECLARE_FILTER_TEMP_SIZE // Filter execution functions (byte mask - one byte per element) -#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \ - cudaError_t filter_bytemask_##suffix( \ - void* d_temp, \ - size_t temp_bytes, \ - const c_type* d_in, \ - const uint8_t* d_flags, \ - c_type* d_out, \ - int64_t* d_num_selected, \ - int64_t num_items, \ - cudaStream_t stream \ - ); +#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \ + cudaError_t filter_bytemask_##suffix(void *d_temp, \ + size_t temp_bytes, \ + const c_type *d_in, \ + const uint8_t *d_flags, \ + c_type *d_out, \ + int64_t *d_num_selected, \ + int64_t num_items, \ + cudaStream_t stream); 
FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK) @@ -68,18 +66,16 @@ FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK) // These functions accept packed bit mask directly, avoiding the need to // expand bits to bytes in a separate kernel. Uses CUB's TransformInputIterator // to read bits on-the-fly during the filter operation. -#define DECLARE_FILTER_BITMASK(suffix, c_type) \ - cudaError_t filter_bitmask_##suffix( \ - void* d_temp, \ - size_t temp_bytes, \ - const c_type* d_in, \ - const uint8_t* d_bitmask, \ - uint64_t bit_offset, \ - c_type* d_out, \ - int64_t* d_num_selected, \ - int64_t num_items, \ - cudaStream_t stream \ - ); +#define DECLARE_FILTER_BITMASK(suffix, c_type) \ + cudaError_t filter_bitmask_##suffix(void *d_temp, \ + size_t temp_bytes, \ + const c_type *d_in, \ + const uint8_t *d_bitmask, \ + uint64_t bit_offset, \ + c_type *d_out, \ + int64_t *d_num_selected, \ + int64_t num_items, \ + cudaStream_t stream); FILTER_TYPE_TABLE(DECLARE_FILTER_BITMASK) diff --git a/vortex-cuda/kernels/src/alp.cu b/vortex-cuda/kernels/src/alp.cu index d08f5fb48a9..e34285d9d9b 100644 --- a/vortex-cuda/kernels/src/alp.cu +++ b/vortex-cuda/kernels/src/alp.cu @@ -7,10 +7,10 @@ // Converts integers to floats by multiplying by precomputed exponent factors. // Formula: decoded = (float)encoded * f * e // Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly. -template +template struct AlpOp { - FloatT f; // F10[exponents.f] - power of 10 - FloatT e; // IF10[exponents.e] - inverse power of 10 + FloatT f; // F10[exponents.f] - power of 10 + FloatT e; // IF10[exponents.e] - inverse power of 10 __device__ inline FloatT operator()(EncodedT value) const { return static_cast(value) * f * e; @@ -19,16 +19,14 @@ struct AlpOp { // Macro to generate ALP kernel for each type combination. // Input is integer (encoded), output is float (decoded). -#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \ -extern "C" __global__ void alp_##enc_suffix##_##float_suffix( \ - const EncType *__restrict encoded, \ - FloatType *__restrict decoded, \ - FloatType f, \ - FloatType e, \ - uint64_t array_len \ -) { \ - scalar_kernel(encoded, decoded, array_len, AlpOp{f, e}); \ -} +#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \ + extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \ + FloatType *__restrict decoded, \ + FloatType f, \ + FloatType e, \ + uint64_t array_len) { \ + scalar_kernel(encoded, decoded, array_len, AlpOp {f, e}); \ + } // f32 variants (ALP for f32 encodes as i32 or i64) GENERATE_ALP_KERNEL(i32, f32, int32_t, float) diff --git a/vortex-cuda/kernels/src/bit_unpack.cuh b/vortex-cuda/kernels/src/bit_unpack.cuh index 3871ad6516a..c69be63a190 100644 --- a/vortex-cuda/kernels/src/bit_unpack.cuh +++ b/vortex-cuda/kernels/src/bit_unpack.cuh @@ -25,17 +25,20 @@ /// * `lane` - Lane index within the block (used to determine which packed words to process) /// * `bit_width` - Number of bits with which each value is encoded template -__device__ inline void bit_unpack_lane(const T *__restrict packed_chunk, T *__restrict output_buffer, - unsigned int lane, uint32_t bit_width); +__device__ inline void bit_unpack_lane(const T *__restrict packed_chunk, + T *__restrict output_buffer, + unsigned int lane, + uint32_t bit_width); /// Template specializations for `bitunpack_lane_to_smem` for different integer types. /// /// Generates template specializations for each supported integer size (8, 16, 32, 64 bits). 
-#define BIT_UNPACK_LANE(bits) \ +#define BIT_UNPACK_LANE(bits) \ template <> \ - __device__ inline void bit_unpack_lane(const uint##bits##_t *in, \ - uint##bits##_t *out, \ - unsigned int lane, uint32_t bw) { \ + __device__ inline void bit_unpack_lane(const uint##bits##_t *in, \ + uint##bits##_t *out, \ + unsigned int lane, \ + uint32_t bw) { \ bit_unpack_##bits##_lane(in, out, lane, bw); \ } diff --git a/vortex-cuda/kernels/src/config.cuh b/vortex-cuda/kernels/src/config.cuh index cf7fba2e848..3b3cc2f73cf 100644 --- a/vortex-cuda/kernels/src/config.cuh +++ b/vortex-cuda/kernels/src/config.cuh @@ -16,4 +16,4 @@ constexpr uint32_t ELEMENTS_PER_THREAD = 32; #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #define START_ELEM(idx, len) MIN((idx) * ELEMENTS_PER_THREAD, (len)) -#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len)) +#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len)) diff --git a/vortex-cuda/kernels/src/config_check.cu b/vortex-cuda/kernels/src/config_check.cu index c3d7db0fe41..ceb0a67977a 100644 --- a/vortex-cuda/kernels/src/config_check.cu +++ b/vortex-cuda/kernels/src/config_check.cu @@ -13,7 +13,7 @@ // Kernel that outputs the config values for verification. // Output buffer layout: [elements_per_thread, block_dim_x, elements_per_block] -extern "C" __global__ void config_check(uint32_t* output) { +extern "C" __global__ void config_check(uint32_t *output) { if (threadIdx.x == 0 && blockIdx.x == 0) { output[0] = ELEMENTS_PER_THREAD; output[1] = blockDim.x; diff --git a/vortex-cuda/kernels/src/constant_numeric.cu b/vortex-cuda/kernels/src/constant_numeric.cu index 197fae81182..7a6d94d9e5c 100644 --- a/vortex-cuda/kernels/src/constant_numeric.cu +++ b/vortex-cuda/kernels/src/constant_numeric.cu @@ -6,12 +6,8 @@ #include // Fill an output buffer with a constant value. -template -__device__ void constant_fill( - T *__restrict output, - T value, - uint64_t array_len -) { +template +__device__ void constant_fill(T *__restrict output, T value, uint64_t array_len) { const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; const uint64_t startElem = START_ELEM(worker, array_len); const uint64_t stopElem = STOP_ELEM(worker, array_len); @@ -25,13 +21,11 @@ __device__ void constant_fill( } } -#define GENERATE_CONSTANT_NUMERIC_KERNEL(suffix, Type) \ -extern "C" __global__ void constant_numeric_##suffix( \ - Type *__restrict output, \ - Type value, \ - uint64_t array_len \ -) { \ - constant_fill(output, value, array_len); \ -} +#define GENERATE_CONSTANT_NUMERIC_KERNEL(suffix, Type) \ + extern "C" __global__ void constant_numeric_##suffix(Type *__restrict output, \ + Type value, \ + uint64_t array_len) { \ + constant_fill(output, value, array_len); \ + } FOR_EACH_NUMERIC(GENERATE_CONSTANT_NUMERIC_KERNEL) diff --git a/vortex-cuda/kernels/src/date_time_parts.cu b/vortex-cuda/kernels/src/date_time_parts.cu index e5ed17e831d..ccb3e614991 100644 --- a/vortex-cuda/kernels/src/date_time_parts.cu +++ b/vortex-cuda/kernels/src/date_time_parts.cu @@ -7,15 +7,13 @@ constexpr int64_t SECONDS_PER_DAY = 86400; // Combines date/time parts (days, seconds, subseconds) into timestamp values. 
-template -__device__ void date_time_parts( - const DaysT *__restrict days, - const SecondsT *__restrict seconds, - const SubsecondsT *__restrict subseconds, - int64_t divisor, - int64_t *__restrict output, - uint64_t array_len -) { +template +__device__ void date_time_parts(const DaysT *__restrict days, + const SecondsT *__restrict seconds, + const SubsecondsT *__restrict subseconds, + int64_t divisor, + int64_t *__restrict output, + uint64_t array_len) { const int64_t ticks_per_day = SECONDS_PER_DAY * divisor; const uint32_t elements_per_block = blockDim.x * ELEMENTS_PER_THREAD; @@ -23,40 +21,43 @@ __device__ void date_time_parts( const uint64_t block_end = min(block_start + elements_per_block, array_len); for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) { - output[idx] = static_cast(days[idx]) * ticks_per_day - + static_cast(seconds[idx]) * divisor - + static_cast(subseconds[idx]); + output[idx] = static_cast(days[idx]) * ticks_per_day + + static_cast(seconds[idx]) * divisor + static_cast(subseconds[idx]); } } -#define GENERATE_DATE_TIME_PARTS_KERNEL(days_suffix, DaysT, seconds_suffix, SecondsT, subseconds_suffix, SubsecondsT) \ -extern "C" __global__ void date_time_parts_##days_suffix##_##seconds_suffix##_##subseconds_suffix( \ - const DaysT *__restrict days, \ - const SecondsT *__restrict seconds, \ - const SubsecondsT *__restrict subseconds, \ - int64_t divisor, \ - int64_t *__restrict output, \ - uint64_t array_len \ -) { \ - date_time_parts(days, seconds, subseconds, divisor, output, array_len); \ -} +#define GENERATE_DATE_TIME_PARTS_KERNEL(days_suffix, \ + DaysT, \ + seconds_suffix, \ + SecondsT, \ + subseconds_suffix, \ + SubsecondsT) \ + extern "C" __global__ void date_time_parts_##days_suffix##_##seconds_suffix##_##subseconds_suffix( \ + const DaysT *__restrict days, \ + const SecondsT *__restrict seconds, \ + const SubsecondsT *__restrict subseconds, \ + int64_t divisor, \ + int64_t *__restrict output, \ + uint64_t array_len) { \ + date_time_parts(days, seconds, subseconds, divisor, output, array_len); \ + } -#define EXPAND_DAYS(X) \ - X(i8, int8_t) \ - X(i16, int16_t) \ - X(i32, int32_t) \ +#define EXPAND_DAYS(X) \ + X(i8, int8_t) \ + X(i16, int16_t) \ + X(i32, int32_t) \ X(i64, int64_t) -#define EXPAND_SUBSECONDS(d, DT, s, ST) \ - GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i8, int8_t) \ - GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i16, int16_t) \ - GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i32, int32_t) \ +#define EXPAND_SUBSECONDS(d, DT, s, ST) \ + GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i8, int8_t) \ + GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i16, int16_t) \ + GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i32, int32_t) \ GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i64, int64_t) -#define EXPAND_SECONDS(d, DT) \ - EXPAND_SUBSECONDS(d, DT, i8, int8_t) \ - EXPAND_SUBSECONDS(d, DT, i16, int16_t) \ - EXPAND_SUBSECONDS(d, DT, i32, int32_t) \ +#define EXPAND_SECONDS(d, DT) \ + EXPAND_SUBSECONDS(d, DT, i8, int8_t) \ + EXPAND_SUBSECONDS(d, DT, i16, int16_t) \ + EXPAND_SUBSECONDS(d, DT, i32, int32_t) \ EXPAND_SUBSECONDS(d, DT, i64, int64_t) // Generate all 64 kernels (4³) diff --git a/vortex-cuda/kernels/src/dict.cu b/vortex-cuda/kernels/src/dict.cu index d9ba7d36b30..3422980c65e 100644 --- a/vortex-cuda/kernels/src/dict.cu +++ b/vortex-cuda/kernels/src/dict.cu @@ -8,18 +8,15 @@ #include "config.cuh" #include "types.cuh" -template -__device__ void dict_kernel( - const IndexT *const __restrict codes, - uint64_t codes_len, - 
const ValueT *const __restrict values, - ValueT *const __restrict output -) { +template +__device__ void dict_kernel(const IndexT *const __restrict codes, + uint64_t codes_len, + const ValueT *const __restrict values, + ValueT *const __restrict output) { const uint32_t elements_per_block = blockDim.x * ELEMENTS_PER_THREAD; const uint64_t block_start = static_cast(blockIdx.x) * elements_per_block; - const uint64_t block_end = (block_start + elements_per_block < codes_len) - ? (block_start + elements_per_block) - : codes_len; + const uint64_t block_end = + (block_start + elements_per_block < codes_len) ? (block_start + elements_per_block) : codes_len; for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) { IndexT code = codes[idx]; @@ -28,21 +25,20 @@ __device__ void dict_kernel( } // Macro to generate dict kernels for all value/index type combinations -#define GENERATE_DICT_KERNEL(value_suffix, ValueType, index_suffix, IndexType) \ -extern "C" __global__ void dict_##value_suffix##_##index_suffix( \ - const IndexType *const __restrict codes, \ - uint64_t codes_len, \ - const ValueType *const __restrict values, \ - ValueType *const __restrict output \ -) { \ - dict_kernel(codes, codes_len, values, output); \ -} +#define GENERATE_DICT_KERNEL(value_suffix, ValueType, index_suffix, IndexType) \ + extern "C" __global__ void dict_##value_suffix##_##index_suffix( \ + const IndexType *const __restrict codes, \ + uint64_t codes_len, \ + const ValueType *const __restrict values, \ + ValueType *const __restrict output) { \ + dict_kernel(codes, codes_len, values, output); \ + } // Generate dict kernel for all index types (unsigned integers) for a given value type -#define GENERATE_DICT_FOR_ALL_INDICES(value_suffix, ValueType) \ - GENERATE_DICT_KERNEL(value_suffix, ValueType, u8, uint8_t) \ - GENERATE_DICT_KERNEL(value_suffix, ValueType, u16, uint16_t) \ - GENERATE_DICT_KERNEL(value_suffix, ValueType, u32, uint32_t) \ +#define GENERATE_DICT_FOR_ALL_INDICES(value_suffix, ValueType) \ + GENERATE_DICT_KERNEL(value_suffix, ValueType, u8, uint8_t) \ + GENERATE_DICT_KERNEL(value_suffix, ValueType, u16, uint16_t) \ + GENERATE_DICT_KERNEL(value_suffix, ValueType, u32, uint32_t) \ GENERATE_DICT_KERNEL(value_suffix, ValueType, u64, uint64_t) // Generate for all native ptypes & decimal values diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu index aa1390d05f9..8af4d8825f5 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.cu +++ b/vortex-cuda/kernels/src/dynamic_dispatch.cu @@ -30,9 +30,11 @@ constexpr uint32_t ELEMENTS_PER_BLOCK = 2048; /// * `chunk_len` - Number of elements in this chunk (may be less than ELEMENTS_PER_BLOCK for tail blocks) /// * `source_op` - The source operation descriptor containing the operation type and parameters template -__device__ inline void dynamic_source_op(const T *__restrict input, T *__restrict smem, - uint64_t chunk_start, uint32_t chunk_len, - const struct SourceOp &source_op) { +__device__ inline void dynamic_source_op(const T *__restrict input, + T *__restrict smem, + uint64_t chunk_start, + uint32_t chunk_len, + const struct SourceOp &source_op) { constexpr uint32_t T_BITS = sizeof(T) * 8; constexpr uint32_t FL_LANES = ELEMENTS_PER_BLOCK / T_BITS; @@ -47,7 +49,9 @@ __device__ inline void dynamic_source_op(const T *__restrict input, T *__restric // FL blocks must divide evenly. Otherwise, the last unpack would overflow `smem`. 
static_assert((ELEMENTS_PER_BLOCK % FL_CHUNK_SIZE) == 0); - const auto div_ceil = [](auto a, auto b) { return (a + b - 1) / b; }; + const auto div_ceil = [](auto a, auto b) { + return (a + b - 1) / b; + }; const uint32_t num_fl_chunks = div_ceil(chunk_len, FL_CHUNK_SIZE); for (uint32_t chunk_idx = 0; chunk_idx < num_fl_chunks; ++chunk_idx) { @@ -60,7 +64,8 @@ __device__ inline void dynamic_source_op(const T *__restrict input, T *__restric } break; } - default: __builtin_unreachable(); + default: + __builtin_unreachable(); } } @@ -90,7 +95,8 @@ __device__ inline T dynamic_scalar_op(T value, const struct ScalarOp &op) { float result = static_cast(static_cast(value)) * op.params.alp.f * op.params.alp.e; return static_cast(__float_as_uint(result)); } - default: __builtin_unreachable(); + default: + __builtin_unreachable(); } } @@ -107,7 +113,9 @@ __device__ inline T dynamic_scalar_op(T value, const struct ScalarOp &op) { /// * `array_len` - Total number of elements /// * `plan` - Operation sequence to apply template -__device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict output, uint64_t array_len, +__device__ void dynamic_dispatch_impl(const T *__restrict input, + T *__restrict output, + uint64_t array_len, const struct DynamicDispatchPlan *__restrict plan) { constexpr uint32_t VALUES_PER_LOOP = 32 / sizeof(T); @@ -115,7 +123,8 @@ __device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict o __shared__ T smem_values[ELEMENTS_PER_BLOCK]; // Cache the plan in shared memory. - if (threadIdx.x == 0) smem_plan = *plan; + if (threadIdx.x == 0) + smem_plan = *plan; __syncthreads(); const uint64_t block_start = static_cast(blockIdx.x) * ELEMENTS_PER_BLOCK; @@ -136,7 +145,9 @@ __device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict o // parallelism. T values[VALUES_PER_LOOP]; + // clang-format off #pragma unroll + // clang-format on for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) { values[idx] = smem_values[tile_base + idx * blockDim.x + threadIdx.x]; } @@ -144,13 +155,17 @@ __device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict o for (uint8_t op_idx = 0; op_idx < smem_plan.num_scalar_ops; ++op_idx) { const struct ScalarOp &scalar_op = smem_plan.scalar_ops[op_idx]; + // clang-format off #pragma unroll + // clang-format on for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) { values[idx] = dynamic_scalar_op(values[idx], scalar_op); } } + // clang-format off #pragma unroll + // clang-format on for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) { output[block_start + tile_base + idx * blockDim.x + threadIdx.x] = values[idx]; } @@ -171,9 +186,11 @@ __device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict o /// /// Creates a CUDA kernel entry point by instantiating `dynamic_dispatch_impl` for the given type. 
#define GENERATE_DYNAMIC_DISPATCH_KERNEL(suffix, Type) \ - extern "C" __global__ void dynamic_dispatch_##suffix(const Type *__restrict input, \ - Type *__restrict output, uint64_t array_len, \ - const struct DynamicDispatchPlan *__restrict plan) { \ + extern "C" __global__ void dynamic_dispatch_##suffix( \ + const Type *__restrict input, \ + Type *__restrict output, \ + uint64_t array_len, \ + const struct DynamicDispatchPlan *__restrict plan) { \ dynamic_dispatch_impl(input, output, array_len, plan); \ } diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.h b/vortex-cuda/kernels/src/dynamic_dispatch.h index 1139e6e0370..f2ba1247b19 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.h +++ b/vortex-cuda/kernels/src/dynamic_dispatch.h @@ -13,7 +13,9 @@ extern "C" { /// Source ops: Fills shared memory from input (exactly one, required) union SourceParams { - struct BitunpackParams { uint8_t bit_width; } bitunpack; + struct BitunpackParams { + uint8_t bit_width; + } bitunpack; }; struct SourceOp { @@ -23,8 +25,13 @@ struct SourceOp { /// Scalar ops: Element-wise transforms in registers (0 or more) union ScalarParams { - struct FoRParams { uint64_t reference; } frame_of_ref; - struct AlpParams { float f; float e; } alp; + struct FoRParams { + uint64_t reference; + } frame_of_ref; + struct AlpParams { + float f; + float e; + } alp; }; struct ScalarOp { @@ -37,7 +44,7 @@ struct ScalarOp { struct DynamicDispatchPlan { struct SourceOp source; - uint8_t num_scalar_ops; + uint8_t num_scalar_ops; struct ScalarOp scalar_ops[MAX_SCALAR_OPS]; }; diff --git a/vortex-cuda/kernels/src/for.cu b/vortex-cuda/kernels/src/for.cu index 653f527b6b9..702a0bfdf7b 100644 --- a/vortex-cuda/kernels/src/for.cu +++ b/vortex-cuda/kernels/src/for.cu @@ -5,7 +5,7 @@ #include "types.cuh" // Frame-of-Reference operation: adds a reference value to each element. -template +template struct ForOp { T reference; @@ -15,25 +15,19 @@ struct ForOp { }; // Macro to generate in-place FoR kernel for each type. -#define GENERATE_FOR_KERNEL(suffix, Type) \ -extern "C" __global__ void for_##suffix( \ - Type *__restrict values, \ - Type reference, \ - uint64_t array_len \ -) { \ - scalar_kernel_inplace(values, array_len, ForOp{reference}); \ -} +#define GENERATE_FOR_KERNEL(suffix, Type) \ + extern "C" __global__ void for_##suffix(Type *__restrict values, Type reference, uint64_t array_len) { \ + scalar_kernel_inplace(values, array_len, ForOp {reference}); \ + } // Macro to generate FoR kernel with separate input/output buffers. -#define GENERATE_FOR_IN_OUT_KERNEL(suffix, Type) \ -extern "C" __global__ void for_in_out_##suffix( \ - const Type *__restrict input, \ - Type *__restrict output, \ - Type reference, \ - uint64_t array_len \ -) { \ - scalar_kernel(input, output, array_len, ForOp{reference}); \ -} +#define GENERATE_FOR_IN_OUT_KERNEL(suffix, Type) \ + extern "C" __global__ void for_in_out_##suffix(const Type *__restrict input, \ + Type *__restrict output, \ + Type reference, \ + uint64_t array_len) { \ + scalar_kernel(input, output, array_len, ForOp {reference}); \ + } // In-place variants (modifies input buffer) - FoR is only used for integers FOR_EACH_INTEGER(GENERATE_FOR_KERNEL) diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu index 00945e50e63..8a87fcd67e7 100644 --- a/vortex-cuda/kernels/src/patches.cu +++ b/vortex-cuda/kernels/src/patches.cu @@ -7,13 +7,11 @@ // TODO(aduffy): this is very naive. In the future we need to // transpose the patches, see G-ALP paper. 
// Apply patches to a source array -template -__device__ void patches( - ValueT *const values, - const IndexT *const patchIndices, - const ValueT *const patchValues, - uint64_t patchesLen -) { +template +__device__ void patches(ValueT *const values, + const IndexT *const patchIndices, + const ValueT *const patchValues, + uint64_t patchesLen) { const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; const uint64_t startElem = START_ELEM(worker, patchesLen); const uint64_t stopElem = STOP_ELEM(worker, patchesLen); @@ -31,21 +29,19 @@ __device__ void patches( } } -#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \ -extern "C" __global__ void patches_##value_suffix##_##index_suffix( \ - ValueT *const values, \ - const IndexT *const patchIndices, \ - const ValueT *const patchValues, \ - uint64_t patchesLen \ -) { \ - patches(values, patchIndices, patchValues, patchesLen); \ -} +#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \ + extern "C" __global__ void patches_##value_suffix##_##index_suffix(ValueT *const values, \ + const IndexT *const patchIndices, \ + const ValueT *const patchValues, \ + uint64_t patchesLen) { \ + patches(values, patchIndices, patchValues, patchesLen); \ + } // Generate patches kernel for all index types (unsigned integers) for a given value type -#define GENERATE_PATCHES_FOR_ALL_INDICES(value_suffix, ValueT) \ - GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \ - GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \ - GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \ +#define GENERATE_PATCHES_FOR_ALL_INDICES(value_suffix, ValueT) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \ GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint64_t, u64) // Generate for all native SIMD ptypes diff --git a/vortex-cuda/kernels/src/runend.cu b/vortex-cuda/kernels/src/runend.cu index 5cac5314c0a..dda94e87a9e 100644 --- a/vortex-cuda/kernels/src/runend.cu +++ b/vortex-cuda/kernels/src/runend.cu @@ -19,18 +19,18 @@ constexpr uint32_t MAX_CACHED_RUNS = 512; /// is chosen as the binary search runs on a single GPU thread. This is /// preferred over `thrust::device` as this would spawn an additional kernel /// launch. -/// See: https://nvidia.github.io/cccl/thrust/api/group__binary__search_1gac85cc9ea00f4bdd8f80ad25fff16741d.html#thrust-upper-bound +/// See: +/// https://nvidia.github.io/cccl/thrust/api/group__binary__search_1gac85cc9ea00f4bdd8f80ad25fff16741d.html#thrust-upper-bound /// /// Returns the index of the first element that is greater than `value`, or /// `len` if no such element exists. -template +template __device__ inline uint64_t upper_bound(const T *data, uint64_t len, uint64_t value) { auto it = thrust::upper_bound(thrust::seq, data, data + len, value); return it - data; } - // Decodes run-end encoded data on the GPU. // // Run-end stores data as pairs of (value, end_position) where each run contains @@ -58,15 +58,13 @@ __device__ inline uint64_t upper_bound(const T *data, uint64_t len, uint64_t val // fall back to binary search in global memory for each element. // // TODO(0ax1): Investigate whether there are faster solutions. 
-template -__device__ void runend_decode_kernel( - const EndsT *const __restrict ends, - uint64_t num_runs, - const ValueT *const __restrict values, - uint64_t offset, - uint64_t output_len, - ValueT *const __restrict output -) { +template +__device__ void runend_decode_kernel(const EndsT *const __restrict ends, + uint64_t num_runs, + const ValueT *const __restrict values, + uint64_t offset, + uint64_t output_len, + ValueT *const __restrict output) { __shared__ EndsT shared_ends[MAX_CACHED_RUNS]; __shared__ ValueT shared_values[MAX_CACHED_RUNS]; __shared__ uint64_t block_first_run; @@ -76,7 +74,8 @@ __device__ void runend_decode_kernel( const uint64_t block_start = static_cast(blockIdx.x) * elements_per_block; const uint64_t block_end = min(block_start + elements_per_block, output_len); - if (block_start >= output_len) return; + if (block_start >= output_len) + return; // Thread 0 finds the run range for this block. if (threadIdx.x == 0) { @@ -87,7 +86,8 @@ __device__ void runend_decode_kernel( uint64_t last_run = upper_bound(ends, num_runs, last_pos); block_first_run = first_run; - block_num_runs = static_cast(min(last_run - first_run + 1, static_cast(MAX_CACHED_RUNS))); + block_num_runs = + static_cast(min(last_run - first_run + 1, static_cast(MAX_CACHED_RUNS))); } __syncthreads(); @@ -118,28 +118,28 @@ __device__ void runend_decode_kernel( for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) { uint64_t pos = idx + offset; uint64_t run_idx = upper_bound(ends, num_runs, pos); - if (run_idx >= num_runs) run_idx = num_runs - 1; + if (run_idx >= num_runs) + run_idx = num_runs - 1; output[idx] = values[run_idx]; } } } -#define GENERATE_RUNEND_KERNEL(value_suffix, ValueType, ends_suffix, EndsType) \ -extern "C" __global__ void runend_##value_suffix##_##ends_suffix( \ - const EndsType *const __restrict ends, \ - uint64_t num_runs, \ - const ValueType *const __restrict values, \ - uint64_t offset, \ - uint64_t output_len, \ - ValueType *const __restrict output \ -) { \ - runend_decode_kernel(ends, num_runs, values, offset, output_len, output); \ -} +#define GENERATE_RUNEND_KERNEL(value_suffix, ValueType, ends_suffix, EndsType) \ + extern "C" __global__ void runend_##value_suffix##_##ends_suffix( \ + const EndsType *const __restrict ends, \ + uint64_t num_runs, \ + const ValueType *const __restrict values, \ + uint64_t offset, \ + uint64_t output_len, \ + ValueType *const __restrict output) { \ + runend_decode_kernel(ends, num_runs, values, offset, output_len, output); \ + } -#define GENERATE_RUNEND_KERNELS_FOR_VALUE(value_suffix, ValueType) \ - GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u8, uint8_t) \ - GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u16, uint16_t) \ - GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u32, uint32_t) \ +#define GENERATE_RUNEND_KERNELS_FOR_VALUE(value_suffix, ValueType) \ + GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u8, uint8_t) \ + GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u16, uint16_t) \ + GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u32, uint32_t) \ GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u64, uint64_t) GENERATE_RUNEND_KERNELS_FOR_VALUE(u8, uint8_t) diff --git a/vortex-cuda/kernels/src/scalar_kernel.cuh b/vortex-cuda/kernels/src/scalar_kernel.cuh index 93564fde3d1..530b4b14dff 100644 --- a/vortex-cuda/kernels/src/scalar_kernel.cuh +++ b/vortex-cuda/kernels/src/scalar_kernel.cuh @@ -14,18 +14,13 @@ // Launch config: grid_dim = (array_len / 2048, 1, 1), block_dim = (64, 1, 1) // Each block handles 2048 
elements, 64 threads per block. // Vectorized to process 16 bytes per iteration for better memory throughput. -template -__device__ void scalar_kernel( - const InputT *__restrict in, - OutputT *__restrict out, - uint64_t array_len, - Op op -) { +template +__device__ void +scalar_kernel(const InputT *__restrict in, OutputT *__restrict out, uint64_t array_len, Op op) { const uint32_t elements_per_block = 2048; const uint64_t block_start = static_cast(blockIdx.x) * elements_per_block; - const uint64_t block_end = (block_start + elements_per_block < array_len) - ? (block_start + elements_per_block) - : array_len; + const uint64_t block_end = + (block_start + elements_per_block < array_len) ? (block_start + elements_per_block) : array_len; // Vectorized loop - process 16 bytes per iteration for better memory throughput. constexpr auto VALUES_PER_LOOP = 16 / sizeof(InputT); @@ -36,7 +31,9 @@ __device__ void scalar_kernel( uint64_t base_idx = idx * VALUES_PER_LOOP; // The loop can be unrolled, as `VALUES_PER_LOOP` is `constexpr`. + // clang-format off #pragma unroll + // clang-format on for (uint64_t i = 0; i < VALUES_PER_LOOP; ++i) { out[base_idx + i] = op(in[base_idx + i]); } @@ -50,11 +47,7 @@ __device__ void scalar_kernel( } // In-place variant (same input/output buffer, same type). -template -__device__ void scalar_kernel_inplace( - T *__restrict values, - uint64_t array_len, - Op op -) { +template +__device__ void scalar_kernel_inplace(T *__restrict values, uint64_t array_len, Op op) { scalar_kernel(values, values, array_len, op); } diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu index eb9df20b46e..d616b6a908e 100644 --- a/vortex-cuda/kernels/src/sequence.cu +++ b/vortex-cuda/kernels/src/sequence.cu @@ -3,13 +3,8 @@ #include "config.cuh" -template -__device__ void sequence( - ValueT *const output, - ValueT base, - ValueT multiplier, - uint64_t len -) { +template +__device__ void sequence(ValueT *const output, ValueT base, ValueT multiplier, uint64_t len) { const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; const uint64_t startElem = START_ELEM(worker, len); @@ -20,15 +15,13 @@ __device__ void sequence( } } -#define GENERATE_KERNEL(ValueT, suffix) \ -extern "C" __global__ void sequence_##suffix( \ - ValueT *const output, \ - ValueT base, \ - ValueT multiplier, \ - uint64_t len \ -) { \ - sequence(output, base, multiplier, len); \ -} +#define GENERATE_KERNEL(ValueT, suffix) \ + extern "C" __global__ void sequence_##suffix(ValueT *const output, \ + ValueT base, \ + ValueT multiplier, \ + uint64_t len) { \ + sequence(output, base, multiplier, len); \ + } GENERATE_KERNEL(uint8_t, u8); GENERATE_KERNEL(uint16_t, u16); diff --git a/vortex-cuda/kernels/src/types.cuh b/vortex-cuda/kernels/src/types.cuh index 54d14aedc3d..519a89f4630 100644 --- a/vortex-cuda/kernels/src/types.cuh +++ b/vortex-cuda/kernels/src/types.cuh @@ -21,48 +21,48 @@ struct __align__(32) int256_t { // These mirror the Rust match_each_*_ptype macros. 
// Unsigned integers -#define FOR_EACH_UNSIGNED_INT(MACRO) \ - MACRO(u8, uint8_t) \ - MACRO(u16, uint16_t) \ - MACRO(u32, uint32_t) \ +#define FOR_EACH_UNSIGNED_INT(MACRO) \ + MACRO(u8, uint8_t) \ + MACRO(u16, uint16_t) \ + MACRO(u32, uint32_t) \ MACRO(u64, uint64_t) // Signed integers -#define FOR_EACH_SIGNED_INT(MACRO) \ - MACRO(i8, int8_t) \ - MACRO(i16, int16_t) \ - MACRO(i32, int32_t) \ +#define FOR_EACH_SIGNED_INT(MACRO) \ + MACRO(i8, int8_t) \ + MACRO(i16, int16_t) \ + MACRO(i32, int32_t) \ MACRO(i64, int64_t) // All integers (signed + unsigned) -#define FOR_EACH_INTEGER(MACRO) \ - FOR_EACH_UNSIGNED_INT(MACRO) \ +#define FOR_EACH_INTEGER(MACRO) \ + FOR_EACH_UNSIGNED_INT(MACRO) \ FOR_EACH_SIGNED_INT(MACRO) // All floating point types (requires #include ) -#define FOR_EACH_FLOAT(MACRO) \ - MACRO(f16, __half) \ - MACRO(f32, float) \ +#define FOR_EACH_FLOAT(MACRO) \ + MACRO(f16, __half) \ + MACRO(f32, float) \ MACRO(f64, double) // Native SIMD types (integers + f32/f64, matches match_each_native_simd_ptype) -#define FOR_EACH_NATIVE_SIMD_PTYPE(MACRO) \ - FOR_EACH_INTEGER(MACRO) \ - MACRO(f32, float) \ +#define FOR_EACH_NATIVE_SIMD_PTYPE(MACRO) \ + FOR_EACH_INTEGER(MACRO) \ + MACRO(f32, float) \ MACRO(f64, double) // All native ptypes (requires #include , matches match_each_native_ptype) -#define FOR_EACH_NATIVE_PTYPE(MACRO) \ - FOR_EACH_INTEGER(MACRO) \ +#define FOR_EACH_NATIVE_PTYPE(MACRO) \ + FOR_EACH_INTEGER(MACRO) \ FOR_EACH_FLOAT(MACRO) // Large decimal types (128-bit and 256-bit integers for decimal representation). // Use alongside FOR_EACH_NATIVE_PTYPE for full type coverage. -#define FOR_EACH_LARGE_DECIMAL(MACRO) \ - MACRO(i128, int128_t) \ +#define FOR_EACH_LARGE_DECIMAL(MACRO) \ + MACRO(i128, int128_t) \ MACRO(i256, int256_t) // All numeric types: native ptypes + large decimals (requires #include ) -#define FOR_EACH_NUMERIC(MACRO) \ - FOR_EACH_NATIVE_PTYPE(MACRO) \ +#define FOR_EACH_NUMERIC(MACRO) \ + FOR_EACH_NATIVE_PTYPE(MACRO) \ FOR_EACH_LARGE_DECIMAL(MACRO) diff --git a/vortex-cuda/kernels/src/varbinview.cuh b/vortex-cuda/kernels/src/varbinview.cuh index 41a97a864a6..d1dd67a710f 100644 --- a/vortex-cuda/kernels/src/varbinview.cuh +++ b/vortex-cuda/kernels/src/varbinview.cuh @@ -9,10 +9,10 @@ constexpr int32_t MAX_INLINED_SIZE = 12; // a byte buffer holding string data -typedef uint8_t* Buffer; +typedef uint8_t *Buffer; // an i32 offsets buffer -typedef int32_t* Offsets; +typedef int32_t *Offsets; struct InlinedBinaryView { int32_t size; diff --git a/vortex-cuda/kernels/src/varbinview_compute_offsets.cu b/vortex-cuda/kernels/src/varbinview_compute_offsets.cu index 600e0f6e5f0..7b7434fd313 100644 --- a/vortex-cuda/kernels/src/varbinview_compute_offsets.cu +++ b/vortex-cuda/kernels/src/varbinview_compute_offsets.cu @@ -5,12 +5,10 @@ #include "varbinview.cuh" // single-threaded, compute offsets -extern "C" __global__ void varbinview_compute_offsets( - const BinaryView *views, - int64_t num_strings, - Offsets out_offsets, - int32_t *last_offset -) { +extern "C" __global__ void varbinview_compute_offsets(const BinaryView *views, + int64_t num_strings, + Offsets out_offsets, + int32_t *last_offset) { const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; // force execution to be single-threaded to compute the prefix diff --git a/vortex-cuda/kernels/src/varbinview_copy_strings.cu b/vortex-cuda/kernels/src/varbinview_copy_strings.cu index ae11fa6f1ab..f843c9f3b3c 100644 --- a/vortex-cuda/kernels/src/varbinview_copy_strings.cu +++ 
b/vortex-cuda/kernels/src/varbinview_copy_strings.cu @@ -4,13 +4,8 @@ #include "config.cuh" #include "varbinview.cuh" -// Lookup a string from a binary view, copying it into -// a destination buffer. -__device__ void copy_string_to_dst( - BinaryView& view, - Buffer *buffers, - uint8_t *dst -) { +// Lookup a string from a binary view, copying it into a destination buffer. +__device__ void copy_string_to_dst(BinaryView &view, Buffer *buffers, uint8_t *dst) { int32_t size = view.inlined.size; uint8_t *src; if (size <= MAX_INLINED_SIZE) { @@ -23,13 +18,11 @@ __device__ void copy_string_to_dst( memcpy(dst, src, size); } -extern "C" __global__ void varbinview_copy_strings( - int64_t len, - BinaryView* views, - Buffer* buffers, - Buffer dst_buffer, - Offsets dst_offsets -) { +extern "C" __global__ void varbinview_copy_strings(int64_t len, + BinaryView *views, + Buffer *buffers, + Buffer dst_buffer, + Offsets dst_offsets) { const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; // Each thread is responsible for copying a single string. @@ -43,4 +36,4 @@ extern "C" __global__ void varbinview_copy_strings( uint8_t *dst = &dst_buffer[offset]; copy_string_to_dst(view, buffers, dst); -} \ No newline at end of file +} diff --git a/vortex-cuda/kernels/src/zigzag.cu b/vortex-cuda/kernels/src/zigzag.cu index 708e20a1be7..4059452da56 100644 --- a/vortex-cuda/kernels/src/zigzag.cu +++ b/vortex-cuda/kernels/src/zigzag.cu @@ -4,10 +4,11 @@ #include "scalar_kernel.cuh" // ZigZag decode operation. +// // Converts unsigned integers back to signed using the ZigZag encoding scheme. // Formula: decoded = (encoded >> 1) ^ -(encoded & 1) // This interleaves positive and negative numbers: 0, -1, 1, -2, 2, -3, ... -template +template struct ZigZagOp { __device__ inline SignedT operator()(UnsignedT value) const { // ZigZag decode: (n >> 1) ^ -(n & 1) @@ -18,14 +19,13 @@ struct ZigZagOp { // Macro to generate ZigZag kernel for each type. // In-place operation: unsigned input, signed output (same size, reinterpret). -#define GENERATE_ZIGZAG_KERNEL(suffix, UnsignedType, SignedType) \ -extern "C" __global__ void zigzag_##suffix( \ - UnsignedType *__restrict values, \ - uint64_t array_len \ -) { \ - scalar_kernel(values, reinterpret_cast(values), array_len, \ - ZigZagOp{}); \ -} +#define GENERATE_ZIGZAG_KERNEL(suffix, UnsignedType, SignedType) \ + extern "C" __global__ void zigzag_##suffix(UnsignedType *__restrict values, uint64_t array_len) { \ + scalar_kernel(values, \ + reinterpret_cast(values), \ + array_len, \ + ZigZagOp {}); \ + } GENERATE_ZIGZAG_KERNEL(u8, uint8_t, int8_t) GENERATE_ZIGZAG_KERNEL(u16, uint16_t, int16_t)
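Usage sketch (editor's note, not part of the patch): the reformatting leaves the extern "C" surface of the CUB filter wrappers unchanged, so host code still queries the temporary-storage size first and then runs the flagged select. Below is a minimal, hypothetical driver for the u32 byte-mask path; the buffer names, the all-ones mask, the use of the default stream, and the omission of error checking are illustrative assumptions only.

// Minimal host-side sketch showing how the extern "C" entry points generated by
// DEFINE_TEMP_SIZE / DEFINE_FILTER_BYTEMASK are driven. Illustrative only.
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>

extern "C" cudaError_t filter_temp_size_u32(size_t *temp_bytes, int64_t num_items);
extern "C" cudaError_t filter_bytemask_u32(void *d_temp,
                                           size_t temp_bytes,
                                           const uint32_t *d_in,
                                           const uint8_t *d_flags,
                                           uint32_t *d_out,
                                           int64_t *d_num_selected,
                                           int64_t num_items,
                                           cudaStream_t stream);

int main() {
    const int64_t n = 1 << 20;

    uint32_t *d_in = nullptr, *d_out = nullptr;
    uint8_t *d_flags = nullptr;
    int64_t *d_count = nullptr;
    cudaMalloc(&d_in, n * sizeof(uint32_t));
    cudaMalloc(&d_out, n * sizeof(uint32_t));
    cudaMalloc(&d_flags, n);  // byte mask: one flag byte per element
    cudaMalloc(&d_count, sizeof(int64_t));
    cudaMemset(d_in, 0, n * sizeof(uint32_t));
    cudaMemset(d_flags, 1, n);  // select every element for this sketch

    // 1. Query CUB's temporary-storage requirement for n items.
    size_t temp_bytes = 0;
    filter_temp_size_u32(&temp_bytes, n);
    void *d_temp = nullptr;
    cudaMalloc(&d_temp, temp_bytes);

    // 2. Run DeviceSelect::Flagged through the C wrapper on the default stream.
    filter_bytemask_u32(d_temp, temp_bytes, d_in, d_flags, d_out, d_count, n, nullptr);

    // 3. Read back how many elements survived the filter.
    int64_t selected = 0;
    cudaMemcpy(&selected, d_count, sizeof(selected), cudaMemcpyDeviceToHost);
    std::printf("selected %lld of %lld rows\n", (long long)selected, (long long)n);

    cudaFree(d_temp);
    cudaFree(d_count);
    cudaFree(d_flags);
    cudaFree(d_out);
    cudaFree(d_in);
    return 0;
}

The packed-bitmask variants (filter_bitmask_*) follow the same call pattern, taking a (d_bitmask, bit_offset) pair in place of the byte flags, so the mask bits are read on the fly and no separate bit-to-byte expansion kernel is needed.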