diff --git a/vortex-cuda/kernels/src/.clang-format b/vortex-cuda/.clang-format similarity index 81% rename from vortex-cuda/kernels/src/.clang-format rename to vortex-cuda/.clang-format index d6b5a383e19..6a25679451b 100644 --- a/vortex-cuda/kernels/src/.clang-format +++ b/vortex-cuda/.clang-format @@ -14,9 +14,11 @@ DerivePointerAlignment: false PointerAlignment: Right AlignConsecutiveMacros: true AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +BinPackArguments: false +BinPackParameters: false AlignAfterOpenBracket: Align SpaceBeforeCpp11BracedList: true SpaceBeforeCtorInitializerColon: true @@ -32,4 +34,3 @@ IncludeBlocks: Regroup Language: Cpp AccessModifierOffset: -4 --- - diff --git a/vortex-cuda/cub/kernels/filter.cu b/vortex-cuda/cub/kernels/filter.cu index ecf11b32b9f..42f55b24540 100644 --- a/vortex-cuda/cub/kernels/filter.cu +++ b/vortex-cuda/cub/kernels/filter.cu @@ -4,10 +4,10 @@ // CUB DeviceSelect::Flagged wrapper for Vortex GPU filtering. #include -#include -#include #include #include +#include +#include // i256 type typedef struct { @@ -17,11 +17,10 @@ typedef struct { // Bit extraction functor for TransformInputIterator struct BitExtractor { - const uint8_t* packed; + const uint8_t *packed; uint64_t bit_offset; - __host__ __device__ inline - uint8_t operator()(int64_t idx) const { + __host__ __device__ inline uint8_t operator()(int64_t idx) const { uint64_t actual_bit = bit_offset + static_cast(idx); uint64_t byte_idx = actual_bit / 8; uint32_t bit_idx = actual_bit % 8; @@ -30,31 +29,27 @@ struct BitExtractor { }; /// Type alias for the packed bit iterator. 
-using PackedBitIterator = thrust::transform_iterator< - BitExtractor, - thrust::counting_iterator ->; +using PackedBitIterator = thrust::transform_iterator>; // CUB DeviceSelect::Flagged - Query temp storage size -template -static cudaError_t filter_temp_size_impl(size_t* temp_bytes, int64_t num_items) { +template +static cudaError_t filter_temp_size_impl(size_t *temp_bytes, int64_t num_items) { size_t bytes = 0; - cudaError_t err = cub::DeviceSelect::Flagged( - nullptr, bytes, - static_cast(nullptr), - static_cast(nullptr), - static_cast(nullptr), - static_cast(nullptr), - num_items - ); + cudaError_t err = cub::DeviceSelect::Flagged(nullptr, + bytes, + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + num_items); *temp_bytes = bytes; return err; } -#define DEFINE_TEMP_SIZE(suffix, Type) \ -extern "C" cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t n) { \ - return filter_temp_size_impl(temp_bytes, n); \ -} +#define DEFINE_TEMP_SIZE(suffix, Type) \ + extern "C" cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t n) { \ + return filter_temp_size_impl(temp_bytes, n); \ + } DEFINE_TEMP_SIZE(u8, uint8_t) DEFINE_TEMP_SIZE(i8, int8_t) @@ -70,33 +65,43 @@ DEFINE_TEMP_SIZE(i128, __int128_t) DEFINE_TEMP_SIZE(i256, __int256_t) // CUB DeviceSelect::Flagged - Execute filter with byte mask (one byte per element) -template -static cudaError_t filter_bytemask_impl( - void* d_temp, - size_t temp_bytes, - const T* d_in, - const uint8_t* d_flags, - T* d_out, - int64_t* d_num_selected, - int64_t num_items, - cudaStream_t stream -) { - return cub::DeviceSelect::Flagged( - d_temp, temp_bytes, - d_in, d_flags, d_out, d_num_selected, - num_items, stream - ); +template +static cudaError_t filter_bytemask_impl(void *d_temp, + size_t temp_bytes, + const T *d_in, + const uint8_t *d_flags, + T *d_out, + int64_t *d_num_selected, + int64_t num_items, + cudaStream_t stream) { + return cub::DeviceSelect::Flagged(d_temp, + temp_bytes, + d_in, + d_flags, + d_out, + d_num_selected, + num_items, + stream); } -#define DEFINE_FILTER_BYTEMASK(suffix, Type) \ -extern "C" cudaError_t filter_bytemask_##suffix( \ - void* d_temp, size_t temp_bytes, \ - const Type* d_in, const uint8_t* d_flags, \ - Type* d_out, int64_t* d_num_selected, \ - int64_t num_items, cudaStream_t stream \ -) { \ - return filter_bytemask_impl(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, num_items, stream); \ -} +#define DEFINE_FILTER_BYTEMASK(suffix, Type) \ + extern "C" cudaError_t filter_bytemask_##suffix(void *d_temp, \ + size_t temp_bytes, \ + const Type *d_in, \ + const uint8_t *d_flags, \ + Type *d_out, \ + int64_t *d_num_selected, \ + int64_t num_items, \ + cudaStream_t stream) { \ + return filter_bytemask_impl(d_temp, \ + temp_bytes, \ + d_in, \ + d_flags, \ + d_out, \ + d_num_selected, \ + num_items, \ + stream); \ + } DEFINE_FILTER_BYTEMASK(u8, uint8_t) DEFINE_FILTER_BYTEMASK(i8, int8_t) @@ -125,41 +130,51 @@ DEFINE_FILTER_BYTEMASK(i256, __int256_t) // d_num_selected: Output count of selected elements // num_items: Number of input elements // stream: CUDA stream -template -static cudaError_t filter_bitmask_impl( - void* d_temp, - size_t temp_bytes, - const T* d_in, - const uint8_t* d_bitmask, - uint64_t bit_offset, - T* d_out, - int64_t* d_num_selected, - int64_t num_items, - cudaStream_t stream -) { +template +static cudaError_t filter_bitmask_impl(void *d_temp, + size_t temp_bytes, + const T *d_in, + const uint8_t *d_bitmask, + uint64_t bit_offset, + T 
*d_out, + int64_t *d_num_selected, + int64_t num_items, + cudaStream_t stream) { // Create a transform iterator to read packed bits. - BitExtractor extractor{d_bitmask, bit_offset}; + BitExtractor extractor {d_bitmask, bit_offset}; thrust::counting_iterator counting_iter(0); PackedBitIterator flag_iter(counting_iter, extractor); - return cub::DeviceSelect::Flagged( - d_temp, temp_bytes, - d_in, flag_iter, d_out, d_num_selected, - num_items, stream - ); + return cub::DeviceSelect::Flagged(d_temp, + temp_bytes, + d_in, + flag_iter, + d_out, + d_num_selected, + num_items, + stream); } -#define DEFINE_FILTER_BITMASK(suffix, Type) \ -extern "C" cudaError_t filter_bitmask_##suffix( \ - void* d_temp, size_t temp_bytes, \ - const Type* d_in, \ - const uint8_t* d_bitmask, \ - uint64_t bit_offset, \ - Type* d_out, int64_t* d_num_selected, \ - int64_t num_items, cudaStream_t stream \ -) { \ - return filter_bitmask_impl(d_temp, temp_bytes, d_in, d_bitmask, bit_offset, d_out, d_num_selected, num_items, stream); \ -} +#define DEFINE_FILTER_BITMASK(suffix, Type) \ + extern "C" cudaError_t filter_bitmask_##suffix(void *d_temp, \ + size_t temp_bytes, \ + const Type *d_in, \ + const uint8_t *d_bitmask, \ + uint64_t bit_offset, \ + Type *d_out, \ + int64_t *d_num_selected, \ + int64_t num_items, \ + cudaStream_t stream) { \ + return filter_bitmask_impl(d_temp, \ + temp_bytes, \ + d_in, \ + d_bitmask, \ + bit_offset, \ + d_out, \ + d_num_selected, \ + num_items, \ + stream); \ + } DEFINE_FILTER_BITMASK(u8, uint8_t) DEFINE_FILTER_BITMASK(i8, int8_t) diff --git a/vortex-cuda/cub/kernels/filter.h b/vortex-cuda/cub/kernels/filter.h index 45458e02985..354b877dc17 100644 --- a/vortex-cuda/cub/kernels/filter.h +++ b/vortex-cuda/cub/kernels/filter.h @@ -17,47 +17,45 @@ typedef struct { // CUDA types - defined as opaque for bindgen typedef int cudaError_t; -typedef void* cudaStream_t; +typedef void *cudaStream_t; #ifdef __cplusplus extern "C" { #endif // X-macro table: (suffix, c_type) -#define FILTER_TYPE_TABLE(X) \ - X(u8, uint8_t) \ - X(i8, int8_t) \ - X(u16, uint16_t) \ - X(i16, int16_t) \ - X(u32, uint32_t) \ - X(i32, int32_t) \ - X(u64, uint64_t) \ - X(i64, int64_t) \ - X(f32, float) \ - X(f64, double) \ - X(i128, __int128_t) \ +#define FILTER_TYPE_TABLE(X) \ + X(u8, uint8_t) \ + X(i8, int8_t) \ + X(u16, uint16_t) \ + X(i16, int16_t) \ + X(u32, uint32_t) \ + X(i32, int32_t) \ + X(u64, uint64_t) \ + X(i64, int64_t) \ + X(f32, float) \ + X(f64, double) \ + X(i128, __int128_t) \ X(i256, __int256_t) // Filter temp size query functions -#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \ - cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t num_items); +#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \ + cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t num_items); FILTER_TYPE_TABLE(DECLARE_FILTER_TEMP_SIZE) #undef DECLARE_FILTER_TEMP_SIZE // Filter execution functions (byte mask - one byte per element) -#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \ - cudaError_t filter_bytemask_##suffix( \ - void* d_temp, \ - size_t temp_bytes, \ - const c_type* d_in, \ - const uint8_t* d_flags, \ - c_type* d_out, \ - int64_t* d_num_selected, \ - int64_t num_items, \ - cudaStream_t stream \ - ); +#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \ + cudaError_t filter_bytemask_##suffix(void *d_temp, \ + size_t temp_bytes, \ + const c_type *d_in, \ + const uint8_t *d_flags, \ + c_type *d_out, \ + int64_t *d_num_selected, \ + int64_t num_items, \ + cudaStream_t stream); 
FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK) @@ -68,18 +66,16 @@ FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK) // These functions accept packed bit mask directly, avoiding the need to // expand bits to bytes in a separate kernel. Uses CUB's TransformInputIterator // to read bits on-the-fly during the filter operation. -#define DECLARE_FILTER_BITMASK(suffix, c_type) \ - cudaError_t filter_bitmask_##suffix( \ - void* d_temp, \ - size_t temp_bytes, \ - const c_type* d_in, \ - const uint8_t* d_bitmask, \ - uint64_t bit_offset, \ - c_type* d_out, \ - int64_t* d_num_selected, \ - int64_t num_items, \ - cudaStream_t stream \ - ); +#define DECLARE_FILTER_BITMASK(suffix, c_type) \ + cudaError_t filter_bitmask_##suffix(void *d_temp, \ + size_t temp_bytes, \ + const c_type *d_in, \ + const uint8_t *d_bitmask, \ + uint64_t bit_offset, \ + c_type *d_out, \ + int64_t *d_num_selected, \ + int64_t num_items, \ + cudaStream_t stream); FILTER_TYPE_TABLE(DECLARE_FILTER_BITMASK) diff --git a/vortex-cuda/kernels/src/alp.cu b/vortex-cuda/kernels/src/alp.cu index d08f5fb48a9..e34285d9d9b 100644 --- a/vortex-cuda/kernels/src/alp.cu +++ b/vortex-cuda/kernels/src/alp.cu @@ -7,10 +7,10 @@ // Converts integers to floats by multiplying by precomputed exponent factors. // Formula: decoded = (float)encoded * f * e // Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly. -template +template struct AlpOp { - FloatT f; // F10[exponents.f] - power of 10 - FloatT e; // IF10[exponents.e] - inverse power of 10 + FloatT f; // F10[exponents.f] - power of 10 + FloatT e; // IF10[exponents.e] - inverse power of 10 __device__ inline FloatT operator()(EncodedT value) const { return static_cast(value) * f * e; @@ -19,16 +19,14 @@ struct AlpOp { // Macro to generate ALP kernel for each type combination. // Input is integer (encoded), output is float (decoded). -#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \ -extern "C" __global__ void alp_##enc_suffix##_##float_suffix( \ - const EncType *__restrict encoded, \ - FloatType *__restrict decoded, \ - FloatType f, \ - FloatType e, \ - uint64_t array_len \ -) { \ - scalar_kernel(encoded, decoded, array_len, AlpOp{f, e}); \ -} +#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \ + extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \ + FloatType *__restrict decoded, \ + FloatType f, \ + FloatType e, \ + uint64_t array_len) { \ + scalar_kernel(encoded, decoded, array_len, AlpOp {f, e}); \ + } // f32 variants (ALP for f32 encodes as i32 or i64) GENERATE_ALP_KERNEL(i32, f32, int32_t, float) diff --git a/vortex-cuda/kernels/src/bit_unpack.cuh b/vortex-cuda/kernels/src/bit_unpack.cuh index 3871ad6516a..c69be63a190 100644 --- a/vortex-cuda/kernels/src/bit_unpack.cuh +++ b/vortex-cuda/kernels/src/bit_unpack.cuh @@ -25,17 +25,20 @@ /// * `lane` - Lane index within the block (used to determine which packed words to process) /// * `bit_width` - Number of bits with which each value is encoded template -__device__ inline void bit_unpack_lane(const T *__restrict packed_chunk, T *__restrict output_buffer, - unsigned int lane, uint32_t bit_width); +__device__ inline void bit_unpack_lane(const T *__restrict packed_chunk, + T *__restrict output_buffer, + unsigned int lane, + uint32_t bit_width); /// Template specializations for `bitunpack_lane_to_smem` for different integer types. /// /// Generates template specializations for each supported integer size (8, 16, 32, 64 bits). 
-#define BIT_UNPACK_LANE(bits) \ +#define BIT_UNPACK_LANE(bits) \ template <> \ - __device__ inline void bit_unpack_lane(const uint##bits##_t *in, \ - uint##bits##_t *out, \ - unsigned int lane, uint32_t bw) { \ + __device__ inline void bit_unpack_lane(const uint##bits##_t *in, \ + uint##bits##_t *out, \ + unsigned int lane, \ + uint32_t bw) { \ bit_unpack_##bits##_lane(in, out, lane, bw); \ } diff --git a/vortex-cuda/kernels/src/config.cuh b/vortex-cuda/kernels/src/config.cuh index cf7fba2e848..3b3cc2f73cf 100644 --- a/vortex-cuda/kernels/src/config.cuh +++ b/vortex-cuda/kernels/src/config.cuh @@ -16,4 +16,4 @@ constexpr uint32_t ELEMENTS_PER_THREAD = 32; #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #define START_ELEM(idx, len) MIN((idx) * ELEMENTS_PER_THREAD, (len)) -#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len)) +#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len)) diff --git a/vortex-cuda/kernels/src/config_check.cu b/vortex-cuda/kernels/src/config_check.cu index c3d7db0fe41..ceb0a67977a 100644 --- a/vortex-cuda/kernels/src/config_check.cu +++ b/vortex-cuda/kernels/src/config_check.cu @@ -13,7 +13,7 @@ // Kernel that outputs the config values for verification. // Output buffer layout: [elements_per_thread, block_dim_x, elements_per_block] -extern "C" __global__ void config_check(uint32_t* output) { +extern "C" __global__ void config_check(uint32_t *output) { if (threadIdx.x == 0 && blockIdx.x == 0) { output[0] = ELEMENTS_PER_THREAD; output[1] = blockDim.x; diff --git a/vortex-cuda/kernels/src/constant_numeric.cu b/vortex-cuda/kernels/src/constant_numeric.cu index 197fae81182..7a6d94d9e5c 100644 --- a/vortex-cuda/kernels/src/constant_numeric.cu +++ b/vortex-cuda/kernels/src/constant_numeric.cu @@ -6,12 +6,8 @@ #include // Fill an output buffer with a constant value. -template -__device__ void constant_fill( - T *__restrict output, - T value, - uint64_t array_len -) { +template +__device__ void constant_fill(T *__restrict output, T value, uint64_t array_len) { const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; const uint64_t startElem = START_ELEM(worker, array_len); const uint64_t stopElem = STOP_ELEM(worker, array_len); @@ -25,13 +21,11 @@ __device__ void constant_fill( } } -#define GENERATE_CONSTANT_NUMERIC_KERNEL(suffix, Type) \ -extern "C" __global__ void constant_numeric_##suffix( \ - Type *__restrict output, \ - Type value, \ - uint64_t array_len \ -) { \ - constant_fill(output, value, array_len); \ -} +#define GENERATE_CONSTANT_NUMERIC_KERNEL(suffix, Type) \ + extern "C" __global__ void constant_numeric_##suffix(Type *__restrict output, \ + Type value, \ + uint64_t array_len) { \ + constant_fill(output, value, array_len); \ + } FOR_EACH_NUMERIC(GENERATE_CONSTANT_NUMERIC_KERNEL) diff --git a/vortex-cuda/kernels/src/date_time_parts.cu b/vortex-cuda/kernels/src/date_time_parts.cu index e5ed17e831d..ccb3e614991 100644 --- a/vortex-cuda/kernels/src/date_time_parts.cu +++ b/vortex-cuda/kernels/src/date_time_parts.cu @@ -7,15 +7,13 @@ constexpr int64_t SECONDS_PER_DAY = 86400; // Combines date/time parts (days, seconds, subseconds) into timestamp values. 
-template -__device__ void date_time_parts( - const DaysT *__restrict days, - const SecondsT *__restrict seconds, - const SubsecondsT *__restrict subseconds, - int64_t divisor, - int64_t *__restrict output, - uint64_t array_len -) { +template +__device__ void date_time_parts(const DaysT *__restrict days, + const SecondsT *__restrict seconds, + const SubsecondsT *__restrict subseconds, + int64_t divisor, + int64_t *__restrict output, + uint64_t array_len) { const int64_t ticks_per_day = SECONDS_PER_DAY * divisor; const uint32_t elements_per_block = blockDim.x * ELEMENTS_PER_THREAD; @@ -23,40 +21,43 @@ __device__ void date_time_parts( const uint64_t block_end = min(block_start + elements_per_block, array_len); for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) { - output[idx] = static_cast(days[idx]) * ticks_per_day - + static_cast(seconds[idx]) * divisor - + static_cast(subseconds[idx]); + output[idx] = static_cast(days[idx]) * ticks_per_day + + static_cast(seconds[idx]) * divisor + static_cast(subseconds[idx]); } } -#define GENERATE_DATE_TIME_PARTS_KERNEL(days_suffix, DaysT, seconds_suffix, SecondsT, subseconds_suffix, SubsecondsT) \ -extern "C" __global__ void date_time_parts_##days_suffix##_##seconds_suffix##_##subseconds_suffix( \ - const DaysT *__restrict days, \ - const SecondsT *__restrict seconds, \ - const SubsecondsT *__restrict subseconds, \ - int64_t divisor, \ - int64_t *__restrict output, \ - uint64_t array_len \ -) { \ - date_time_parts(days, seconds, subseconds, divisor, output, array_len); \ -} +#define GENERATE_DATE_TIME_PARTS_KERNEL(days_suffix, \ + DaysT, \ + seconds_suffix, \ + SecondsT, \ + subseconds_suffix, \ + SubsecondsT) \ + extern "C" __global__ void date_time_parts_##days_suffix##_##seconds_suffix##_##subseconds_suffix( \ + const DaysT *__restrict days, \ + const SecondsT *__restrict seconds, \ + const SubsecondsT *__restrict subseconds, \ + int64_t divisor, \ + int64_t *__restrict output, \ + uint64_t array_len) { \ + date_time_parts(days, seconds, subseconds, divisor, output, array_len); \ + } -#define EXPAND_DAYS(X) \ - X(i8, int8_t) \ - X(i16, int16_t) \ - X(i32, int32_t) \ +#define EXPAND_DAYS(X) \ + X(i8, int8_t) \ + X(i16, int16_t) \ + X(i32, int32_t) \ X(i64, int64_t) -#define EXPAND_SUBSECONDS(d, DT, s, ST) \ - GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i8, int8_t) \ - GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i16, int16_t) \ - GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i32, int32_t) \ +#define EXPAND_SUBSECONDS(d, DT, s, ST) \ + GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i8, int8_t) \ + GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i16, int16_t) \ + GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i32, int32_t) \ GENERATE_DATE_TIME_PARTS_KERNEL(d, DT, s, ST, i64, int64_t) -#define EXPAND_SECONDS(d, DT) \ - EXPAND_SUBSECONDS(d, DT, i8, int8_t) \ - EXPAND_SUBSECONDS(d, DT, i16, int16_t) \ - EXPAND_SUBSECONDS(d, DT, i32, int32_t) \ +#define EXPAND_SECONDS(d, DT) \ + EXPAND_SUBSECONDS(d, DT, i8, int8_t) \ + EXPAND_SUBSECONDS(d, DT, i16, int16_t) \ + EXPAND_SUBSECONDS(d, DT, i32, int32_t) \ EXPAND_SUBSECONDS(d, DT, i64, int64_t) // Generate all 64 kernels (4³) diff --git a/vortex-cuda/kernels/src/dict.cu b/vortex-cuda/kernels/src/dict.cu index d9ba7d36b30..3422980c65e 100644 --- a/vortex-cuda/kernels/src/dict.cu +++ b/vortex-cuda/kernels/src/dict.cu @@ -8,18 +8,15 @@ #include "config.cuh" #include "types.cuh" -template -__device__ void dict_kernel( - const IndexT *const __restrict codes, - uint64_t codes_len, - 
const ValueT *const __restrict values, - ValueT *const __restrict output -) { +template +__device__ void dict_kernel(const IndexT *const __restrict codes, + uint64_t codes_len, + const ValueT *const __restrict values, + ValueT *const __restrict output) { const uint32_t elements_per_block = blockDim.x * ELEMENTS_PER_THREAD; const uint64_t block_start = static_cast(blockIdx.x) * elements_per_block; - const uint64_t block_end = (block_start + elements_per_block < codes_len) - ? (block_start + elements_per_block) - : codes_len; + const uint64_t block_end = + (block_start + elements_per_block < codes_len) ? (block_start + elements_per_block) : codes_len; for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) { IndexT code = codes[idx]; @@ -28,21 +25,20 @@ __device__ void dict_kernel( } // Macro to generate dict kernels for all value/index type combinations -#define GENERATE_DICT_KERNEL(value_suffix, ValueType, index_suffix, IndexType) \ -extern "C" __global__ void dict_##value_suffix##_##index_suffix( \ - const IndexType *const __restrict codes, \ - uint64_t codes_len, \ - const ValueType *const __restrict values, \ - ValueType *const __restrict output \ -) { \ - dict_kernel(codes, codes_len, values, output); \ -} +#define GENERATE_DICT_KERNEL(value_suffix, ValueType, index_suffix, IndexType) \ + extern "C" __global__ void dict_##value_suffix##_##index_suffix( \ + const IndexType *const __restrict codes, \ + uint64_t codes_len, \ + const ValueType *const __restrict values, \ + ValueType *const __restrict output) { \ + dict_kernel(codes, codes_len, values, output); \ + } // Generate dict kernel for all index types (unsigned integers) for a given value type -#define GENERATE_DICT_FOR_ALL_INDICES(value_suffix, ValueType) \ - GENERATE_DICT_KERNEL(value_suffix, ValueType, u8, uint8_t) \ - GENERATE_DICT_KERNEL(value_suffix, ValueType, u16, uint16_t) \ - GENERATE_DICT_KERNEL(value_suffix, ValueType, u32, uint32_t) \ +#define GENERATE_DICT_FOR_ALL_INDICES(value_suffix, ValueType) \ + GENERATE_DICT_KERNEL(value_suffix, ValueType, u8, uint8_t) \ + GENERATE_DICT_KERNEL(value_suffix, ValueType, u16, uint16_t) \ + GENERATE_DICT_KERNEL(value_suffix, ValueType, u32, uint32_t) \ GENERATE_DICT_KERNEL(value_suffix, ValueType, u64, uint64_t) // Generate for all native ptypes & decimal values diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu index aa1390d05f9..8af4d8825f5 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.cu +++ b/vortex-cuda/kernels/src/dynamic_dispatch.cu @@ -30,9 +30,11 @@ constexpr uint32_t ELEMENTS_PER_BLOCK = 2048; /// * `chunk_len` - Number of elements in this chunk (may be less than ELEMENTS_PER_BLOCK for tail blocks) /// * `source_op` - The source operation descriptor containing the operation type and parameters template -__device__ inline void dynamic_source_op(const T *__restrict input, T *__restrict smem, - uint64_t chunk_start, uint32_t chunk_len, - const struct SourceOp &source_op) { +__device__ inline void dynamic_source_op(const T *__restrict input, + T *__restrict smem, + uint64_t chunk_start, + uint32_t chunk_len, + const struct SourceOp &source_op) { constexpr uint32_t T_BITS = sizeof(T) * 8; constexpr uint32_t FL_LANES = ELEMENTS_PER_BLOCK / T_BITS; @@ -47,7 +49,9 @@ __device__ inline void dynamic_source_op(const T *__restrict input, T *__restric // FL blocks must divide evenly. Otherwise, the last unpack would overflow `smem`. 
static_assert((ELEMENTS_PER_BLOCK % FL_CHUNK_SIZE) == 0); - const auto div_ceil = [](auto a, auto b) { return (a + b - 1) / b; }; + const auto div_ceil = [](auto a, auto b) { + return (a + b - 1) / b; + }; const uint32_t num_fl_chunks = div_ceil(chunk_len, FL_CHUNK_SIZE); for (uint32_t chunk_idx = 0; chunk_idx < num_fl_chunks; ++chunk_idx) { @@ -60,7 +64,8 @@ __device__ inline void dynamic_source_op(const T *__restrict input, T *__restric } break; } - default: __builtin_unreachable(); + default: + __builtin_unreachable(); } } @@ -90,7 +95,8 @@ __device__ inline T dynamic_scalar_op(T value, const struct ScalarOp &op) { float result = static_cast(static_cast(value)) * op.params.alp.f * op.params.alp.e; return static_cast(__float_as_uint(result)); } - default: __builtin_unreachable(); + default: + __builtin_unreachable(); } } @@ -107,7 +113,9 @@ __device__ inline T dynamic_scalar_op(T value, const struct ScalarOp &op) { /// * `array_len` - Total number of elements /// * `plan` - Operation sequence to apply template -__device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict output, uint64_t array_len, +__device__ void dynamic_dispatch_impl(const T *__restrict input, + T *__restrict output, + uint64_t array_len, const struct DynamicDispatchPlan *__restrict plan) { constexpr uint32_t VALUES_PER_LOOP = 32 / sizeof(T); @@ -115,7 +123,8 @@ __device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict o __shared__ T smem_values[ELEMENTS_PER_BLOCK]; // Cache the plan in shared memory. - if (threadIdx.x == 0) smem_plan = *plan; + if (threadIdx.x == 0) + smem_plan = *plan; __syncthreads(); const uint64_t block_start = static_cast(blockIdx.x) * ELEMENTS_PER_BLOCK; @@ -136,7 +145,9 @@ __device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict o // parallelism. T values[VALUES_PER_LOOP]; + // clang-format off #pragma unroll + // clang-format on for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) { values[idx] = smem_values[tile_base + idx * blockDim.x + threadIdx.x]; } @@ -144,13 +155,17 @@ __device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict o for (uint8_t op_idx = 0; op_idx < smem_plan.num_scalar_ops; ++op_idx) { const struct ScalarOp &scalar_op = smem_plan.scalar_ops[op_idx]; + // clang-format off #pragma unroll + // clang-format on for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) { values[idx] = dynamic_scalar_op(values[idx], scalar_op); } } + // clang-format off #pragma unroll + // clang-format on for (uint32_t idx = 0; idx < VALUES_PER_LOOP; ++idx) { output[block_start + tile_base + idx * blockDim.x + threadIdx.x] = values[idx]; } @@ -171,9 +186,11 @@ __device__ void dynamic_dispatch_impl(const T *__restrict input, T *__restrict o /// /// Creates a CUDA kernel entry point by instantiating `dynamic_dispatch_impl` for the given type. 
#define GENERATE_DYNAMIC_DISPATCH_KERNEL(suffix, Type) \ - extern "C" __global__ void dynamic_dispatch_##suffix(const Type *__restrict input, \ - Type *__restrict output, uint64_t array_len, \ - const struct DynamicDispatchPlan *__restrict plan) { \ + extern "C" __global__ void dynamic_dispatch_##suffix( \ + const Type *__restrict input, \ + Type *__restrict output, \ + uint64_t array_len, \ + const struct DynamicDispatchPlan *__restrict plan) { \ dynamic_dispatch_impl(input, output, array_len, plan); \ } diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.h b/vortex-cuda/kernels/src/dynamic_dispatch.h index 1139e6e0370..f2ba1247b19 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.h +++ b/vortex-cuda/kernels/src/dynamic_dispatch.h @@ -13,7 +13,9 @@ extern "C" { /// Source ops: Fills shared memory from input (exactly one, required) union SourceParams { - struct BitunpackParams { uint8_t bit_width; } bitunpack; + struct BitunpackParams { + uint8_t bit_width; + } bitunpack; }; struct SourceOp { @@ -23,8 +25,13 @@ struct SourceOp { /// Scalar ops: Element-wise transforms in registers (0 or more) union ScalarParams { - struct FoRParams { uint64_t reference; } frame_of_ref; - struct AlpParams { float f; float e; } alp; + struct FoRParams { + uint64_t reference; + } frame_of_ref; + struct AlpParams { + float f; + float e; + } alp; }; struct ScalarOp { @@ -37,7 +44,7 @@ struct ScalarOp { struct DynamicDispatchPlan { struct SourceOp source; - uint8_t num_scalar_ops; + uint8_t num_scalar_ops; struct ScalarOp scalar_ops[MAX_SCALAR_OPS]; }; diff --git a/vortex-cuda/kernels/src/for.cu b/vortex-cuda/kernels/src/for.cu index 653f527b6b9..702a0bfdf7b 100644 --- a/vortex-cuda/kernels/src/for.cu +++ b/vortex-cuda/kernels/src/for.cu @@ -5,7 +5,7 @@ #include "types.cuh" // Frame-of-Reference operation: adds a reference value to each element. -template +template struct ForOp { T reference; @@ -15,25 +15,19 @@ struct ForOp { }; // Macro to generate in-place FoR kernel for each type. -#define GENERATE_FOR_KERNEL(suffix, Type) \ -extern "C" __global__ void for_##suffix( \ - Type *__restrict values, \ - Type reference, \ - uint64_t array_len \ -) { \ - scalar_kernel_inplace(values, array_len, ForOp{reference}); \ -} +#define GENERATE_FOR_KERNEL(suffix, Type) \ + extern "C" __global__ void for_##suffix(Type *__restrict values, Type reference, uint64_t array_len) { \ + scalar_kernel_inplace(values, array_len, ForOp {reference}); \ + } // Macro to generate FoR kernel with separate input/output buffers. -#define GENERATE_FOR_IN_OUT_KERNEL(suffix, Type) \ -extern "C" __global__ void for_in_out_##suffix( \ - const Type *__restrict input, \ - Type *__restrict output, \ - Type reference, \ - uint64_t array_len \ -) { \ - scalar_kernel(input, output, array_len, ForOp{reference}); \ -} +#define GENERATE_FOR_IN_OUT_KERNEL(suffix, Type) \ + extern "C" __global__ void for_in_out_##suffix(const Type *__restrict input, \ + Type *__restrict output, \ + Type reference, \ + uint64_t array_len) { \ + scalar_kernel(input, output, array_len, ForOp {reference}); \ + } // In-place variants (modifies input buffer) - FoR is only used for integers FOR_EACH_INTEGER(GENERATE_FOR_KERNEL) diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu index 00945e50e63..8a87fcd67e7 100644 --- a/vortex-cuda/kernels/src/patches.cu +++ b/vortex-cuda/kernels/src/patches.cu @@ -7,13 +7,11 @@ // TODO(aduffy): this is very naive. In the future we need to // transpose the patches, see G-ALP paper. 
// Apply patches to a source array -template -__device__ void patches( - ValueT *const values, - const IndexT *const patchIndices, - const ValueT *const patchValues, - uint64_t patchesLen -) { +template +__device__ void patches(ValueT *const values, + const IndexT *const patchIndices, + const ValueT *const patchValues, + uint64_t patchesLen) { const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; const uint64_t startElem = START_ELEM(worker, patchesLen); const uint64_t stopElem = STOP_ELEM(worker, patchesLen); @@ -31,21 +29,19 @@ __device__ void patches( } } -#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \ -extern "C" __global__ void patches_##value_suffix##_##index_suffix( \ - ValueT *const values, \ - const IndexT *const patchIndices, \ - const ValueT *const patchValues, \ - uint64_t patchesLen \ -) { \ - patches(values, patchIndices, patchValues, patchesLen); \ -} +#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \ + extern "C" __global__ void patches_##value_suffix##_##index_suffix(ValueT *const values, \ + const IndexT *const patchIndices, \ + const ValueT *const patchValues, \ + uint64_t patchesLen) { \ + patches(values, patchIndices, patchValues, patchesLen); \ + } // Generate patches kernel for all index types (unsigned integers) for a given value type -#define GENERATE_PATCHES_FOR_ALL_INDICES(value_suffix, ValueT) \ - GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \ - GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \ - GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \ +#define GENERATE_PATCHES_FOR_ALL_INDICES(value_suffix, ValueT) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \ GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint64_t, u64) // Generate for all native SIMD ptypes diff --git a/vortex-cuda/kernels/src/runend.cu b/vortex-cuda/kernels/src/runend.cu index 5cac5314c0a..dda94e87a9e 100644 --- a/vortex-cuda/kernels/src/runend.cu +++ b/vortex-cuda/kernels/src/runend.cu @@ -19,18 +19,18 @@ constexpr uint32_t MAX_CACHED_RUNS = 512; /// is chosen as the binary search runs on a single GPU thread. This is /// preferred over `thrust::device` as this would spawn an additional kernel /// launch. -/// See: https://nvidia.github.io/cccl/thrust/api/group__binary__search_1gac85cc9ea00f4bdd8f80ad25fff16741d.html#thrust-upper-bound +/// See: +/// https://nvidia.github.io/cccl/thrust/api/group__binary__search_1gac85cc9ea00f4bdd8f80ad25fff16741d.html#thrust-upper-bound /// /// Returns the index of the first element that is greater than `value`, or /// `len` if no such element exists. -template +template __device__ inline uint64_t upper_bound(const T *data, uint64_t len, uint64_t value) { auto it = thrust::upper_bound(thrust::seq, data, data + len, value); return it - data; } - // Decodes run-end encoded data on the GPU. // // Run-end stores data as pairs of (value, end_position) where each run contains @@ -58,15 +58,13 @@ __device__ inline uint64_t upper_bound(const T *data, uint64_t len, uint64_t val // fall back to binary search in global memory for each element. // // TODO(0ax1): Investigate whether there are faster solutions. 
-template -__device__ void runend_decode_kernel( - const EndsT *const __restrict ends, - uint64_t num_runs, - const ValueT *const __restrict values, - uint64_t offset, - uint64_t output_len, - ValueT *const __restrict output -) { +template +__device__ void runend_decode_kernel(const EndsT *const __restrict ends, + uint64_t num_runs, + const ValueT *const __restrict values, + uint64_t offset, + uint64_t output_len, + ValueT *const __restrict output) { __shared__ EndsT shared_ends[MAX_CACHED_RUNS]; __shared__ ValueT shared_values[MAX_CACHED_RUNS]; __shared__ uint64_t block_first_run; @@ -76,7 +74,8 @@ __device__ void runend_decode_kernel( const uint64_t block_start = static_cast(blockIdx.x) * elements_per_block; const uint64_t block_end = min(block_start + elements_per_block, output_len); - if (block_start >= output_len) return; + if (block_start >= output_len) + return; // Thread 0 finds the run range for this block. if (threadIdx.x == 0) { @@ -87,7 +86,8 @@ __device__ void runend_decode_kernel( uint64_t last_run = upper_bound(ends, num_runs, last_pos); block_first_run = first_run; - block_num_runs = static_cast(min(last_run - first_run + 1, static_cast(MAX_CACHED_RUNS))); + block_num_runs = + static_cast(min(last_run - first_run + 1, static_cast(MAX_CACHED_RUNS))); } __syncthreads(); @@ -118,28 +118,28 @@ __device__ void runend_decode_kernel( for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) { uint64_t pos = idx + offset; uint64_t run_idx = upper_bound(ends, num_runs, pos); - if (run_idx >= num_runs) run_idx = num_runs - 1; + if (run_idx >= num_runs) + run_idx = num_runs - 1; output[idx] = values[run_idx]; } } } -#define GENERATE_RUNEND_KERNEL(value_suffix, ValueType, ends_suffix, EndsType) \ -extern "C" __global__ void runend_##value_suffix##_##ends_suffix( \ - const EndsType *const __restrict ends, \ - uint64_t num_runs, \ - const ValueType *const __restrict values, \ - uint64_t offset, \ - uint64_t output_len, \ - ValueType *const __restrict output \ -) { \ - runend_decode_kernel(ends, num_runs, values, offset, output_len, output); \ -} +#define GENERATE_RUNEND_KERNEL(value_suffix, ValueType, ends_suffix, EndsType) \ + extern "C" __global__ void runend_##value_suffix##_##ends_suffix( \ + const EndsType *const __restrict ends, \ + uint64_t num_runs, \ + const ValueType *const __restrict values, \ + uint64_t offset, \ + uint64_t output_len, \ + ValueType *const __restrict output) { \ + runend_decode_kernel(ends, num_runs, values, offset, output_len, output); \ + } -#define GENERATE_RUNEND_KERNELS_FOR_VALUE(value_suffix, ValueType) \ - GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u8, uint8_t) \ - GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u16, uint16_t) \ - GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u32, uint32_t) \ +#define GENERATE_RUNEND_KERNELS_FOR_VALUE(value_suffix, ValueType) \ + GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u8, uint8_t) \ + GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u16, uint16_t) \ + GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u32, uint32_t) \ GENERATE_RUNEND_KERNEL(value_suffix, ValueType, u64, uint64_t) GENERATE_RUNEND_KERNELS_FOR_VALUE(u8, uint8_t) diff --git a/vortex-cuda/kernels/src/scalar_kernel.cuh b/vortex-cuda/kernels/src/scalar_kernel.cuh index 93564fde3d1..530b4b14dff 100644 --- a/vortex-cuda/kernels/src/scalar_kernel.cuh +++ b/vortex-cuda/kernels/src/scalar_kernel.cuh @@ -14,18 +14,13 @@ // Launch config: grid_dim = (array_len / 2048, 1, 1), block_dim = (64, 1, 1) // Each block handles 2048 
elements, 64 threads per block. // Vectorized to process 16 bytes per iteration for better memory throughput. -template -__device__ void scalar_kernel( - const InputT *__restrict in, - OutputT *__restrict out, - uint64_t array_len, - Op op -) { +template +__device__ void +scalar_kernel(const InputT *__restrict in, OutputT *__restrict out, uint64_t array_len, Op op) { const uint32_t elements_per_block = 2048; const uint64_t block_start = static_cast(blockIdx.x) * elements_per_block; - const uint64_t block_end = (block_start + elements_per_block < array_len) - ? (block_start + elements_per_block) - : array_len; + const uint64_t block_end = + (block_start + elements_per_block < array_len) ? (block_start + elements_per_block) : array_len; // Vectorized loop - process 16 bytes per iteration for better memory throughput. constexpr auto VALUES_PER_LOOP = 16 / sizeof(InputT); @@ -36,7 +31,9 @@ __device__ void scalar_kernel( uint64_t base_idx = idx * VALUES_PER_LOOP; // The loop can be unrolled, as `VALUES_PER_LOOP` is `constexpr`. + // clang-format off #pragma unroll + // clang-format on for (uint64_t i = 0; i < VALUES_PER_LOOP; ++i) { out[base_idx + i] = op(in[base_idx + i]); } @@ -50,11 +47,7 @@ __device__ void scalar_kernel( } // In-place variant (same input/output buffer, same type). -template -__device__ void scalar_kernel_inplace( - T *__restrict values, - uint64_t array_len, - Op op -) { +template +__device__ void scalar_kernel_inplace(T *__restrict values, uint64_t array_len, Op op) { scalar_kernel(values, values, array_len, op); } diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu index eb9df20b46e..d616b6a908e 100644 --- a/vortex-cuda/kernels/src/sequence.cu +++ b/vortex-cuda/kernels/src/sequence.cu @@ -3,13 +3,8 @@ #include "config.cuh" -template -__device__ void sequence( - ValueT *const output, - ValueT base, - ValueT multiplier, - uint64_t len -) { +template +__device__ void sequence(ValueT *const output, ValueT base, ValueT multiplier, uint64_t len) { const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; const uint64_t startElem = START_ELEM(worker, len); @@ -20,15 +15,13 @@ __device__ void sequence( } } -#define GENERATE_KERNEL(ValueT, suffix) \ -extern "C" __global__ void sequence_##suffix( \ - ValueT *const output, \ - ValueT base, \ - ValueT multiplier, \ - uint64_t len \ -) { \ - sequence(output, base, multiplier, len); \ -} +#define GENERATE_KERNEL(ValueT, suffix) \ + extern "C" __global__ void sequence_##suffix(ValueT *const output, \ + ValueT base, \ + ValueT multiplier, \ + uint64_t len) { \ + sequence(output, base, multiplier, len); \ + } GENERATE_KERNEL(uint8_t, u8); GENERATE_KERNEL(uint16_t, u16); diff --git a/vortex-cuda/kernels/src/types.cuh b/vortex-cuda/kernels/src/types.cuh index 54d14aedc3d..519a89f4630 100644 --- a/vortex-cuda/kernels/src/types.cuh +++ b/vortex-cuda/kernels/src/types.cuh @@ -21,48 +21,48 @@ struct __align__(32) int256_t { // These mirror the Rust match_each_*_ptype macros. 
// Unsigned integers -#define FOR_EACH_UNSIGNED_INT(MACRO) \ - MACRO(u8, uint8_t) \ - MACRO(u16, uint16_t) \ - MACRO(u32, uint32_t) \ +#define FOR_EACH_UNSIGNED_INT(MACRO) \ + MACRO(u8, uint8_t) \ + MACRO(u16, uint16_t) \ + MACRO(u32, uint32_t) \ MACRO(u64, uint64_t) // Signed integers -#define FOR_EACH_SIGNED_INT(MACRO) \ - MACRO(i8, int8_t) \ - MACRO(i16, int16_t) \ - MACRO(i32, int32_t) \ +#define FOR_EACH_SIGNED_INT(MACRO) \ + MACRO(i8, int8_t) \ + MACRO(i16, int16_t) \ + MACRO(i32, int32_t) \ MACRO(i64, int64_t) // All integers (signed + unsigned) -#define FOR_EACH_INTEGER(MACRO) \ - FOR_EACH_UNSIGNED_INT(MACRO) \ +#define FOR_EACH_INTEGER(MACRO) \ + FOR_EACH_UNSIGNED_INT(MACRO) \ FOR_EACH_SIGNED_INT(MACRO) // All floating point types (requires #include ) -#define FOR_EACH_FLOAT(MACRO) \ - MACRO(f16, __half) \ - MACRO(f32, float) \ +#define FOR_EACH_FLOAT(MACRO) \ + MACRO(f16, __half) \ + MACRO(f32, float) \ MACRO(f64, double) // Native SIMD types (integers + f32/f64, matches match_each_native_simd_ptype) -#define FOR_EACH_NATIVE_SIMD_PTYPE(MACRO) \ - FOR_EACH_INTEGER(MACRO) \ - MACRO(f32, float) \ +#define FOR_EACH_NATIVE_SIMD_PTYPE(MACRO) \ + FOR_EACH_INTEGER(MACRO) \ + MACRO(f32, float) \ MACRO(f64, double) // All native ptypes (requires #include , matches match_each_native_ptype) -#define FOR_EACH_NATIVE_PTYPE(MACRO) \ - FOR_EACH_INTEGER(MACRO) \ +#define FOR_EACH_NATIVE_PTYPE(MACRO) \ + FOR_EACH_INTEGER(MACRO) \ FOR_EACH_FLOAT(MACRO) // Large decimal types (128-bit and 256-bit integers for decimal representation). // Use alongside FOR_EACH_NATIVE_PTYPE for full type coverage. -#define FOR_EACH_LARGE_DECIMAL(MACRO) \ - MACRO(i128, int128_t) \ +#define FOR_EACH_LARGE_DECIMAL(MACRO) \ + MACRO(i128, int128_t) \ MACRO(i256, int256_t) // All numeric types: native ptypes + large decimals (requires #include ) -#define FOR_EACH_NUMERIC(MACRO) \ - FOR_EACH_NATIVE_PTYPE(MACRO) \ +#define FOR_EACH_NUMERIC(MACRO) \ + FOR_EACH_NATIVE_PTYPE(MACRO) \ FOR_EACH_LARGE_DECIMAL(MACRO) diff --git a/vortex-cuda/kernels/src/varbinview.cuh b/vortex-cuda/kernels/src/varbinview.cuh index 41a97a864a6..d1dd67a710f 100644 --- a/vortex-cuda/kernels/src/varbinview.cuh +++ b/vortex-cuda/kernels/src/varbinview.cuh @@ -9,10 +9,10 @@ constexpr int32_t MAX_INLINED_SIZE = 12; // a byte buffer holding string data -typedef uint8_t* Buffer; +typedef uint8_t *Buffer; // an i32 offsets buffer -typedef int32_t* Offsets; +typedef int32_t *Offsets; struct InlinedBinaryView { int32_t size; diff --git a/vortex-cuda/kernels/src/varbinview_compute_offsets.cu b/vortex-cuda/kernels/src/varbinview_compute_offsets.cu index 600e0f6e5f0..7b7434fd313 100644 --- a/vortex-cuda/kernels/src/varbinview_compute_offsets.cu +++ b/vortex-cuda/kernels/src/varbinview_compute_offsets.cu @@ -5,12 +5,10 @@ #include "varbinview.cuh" // single-threaded, compute offsets -extern "C" __global__ void varbinview_compute_offsets( - const BinaryView *views, - int64_t num_strings, - Offsets out_offsets, - int32_t *last_offset -) { +extern "C" __global__ void varbinview_compute_offsets(const BinaryView *views, + int64_t num_strings, + Offsets out_offsets, + int32_t *last_offset) { const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; // force execution to be single-threaded to compute the prefix diff --git a/vortex-cuda/kernels/src/varbinview_copy_strings.cu b/vortex-cuda/kernels/src/varbinview_copy_strings.cu index ae11fa6f1ab..f843c9f3b3c 100644 --- a/vortex-cuda/kernels/src/varbinview_copy_strings.cu +++ 
b/vortex-cuda/kernels/src/varbinview_copy_strings.cu @@ -4,13 +4,8 @@ #include "config.cuh" #include "varbinview.cuh" -// Lookup a string from a binary view, copying it into -// a destination buffer. -__device__ void copy_string_to_dst( - BinaryView& view, - Buffer *buffers, - uint8_t *dst -) { +// Lookup a string from a binary view, copying it into a destination buffer. +__device__ void copy_string_to_dst(BinaryView &view, Buffer *buffers, uint8_t *dst) { int32_t size = view.inlined.size; uint8_t *src; if (size <= MAX_INLINED_SIZE) { @@ -23,13 +18,11 @@ __device__ void copy_string_to_dst( memcpy(dst, src, size); } -extern "C" __global__ void varbinview_copy_strings( - int64_t len, - BinaryView* views, - Buffer* buffers, - Buffer dst_buffer, - Offsets dst_offsets -) { +extern "C" __global__ void varbinview_copy_strings(int64_t len, + BinaryView *views, + Buffer *buffers, + Buffer dst_buffer, + Offsets dst_offsets) { const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; // Each thread is responsible for copying a single string. @@ -43,4 +36,4 @@ extern "C" __global__ void varbinview_copy_strings( uint8_t *dst = &dst_buffer[offset]; copy_string_to_dst(view, buffers, dst); -} \ No newline at end of file +} diff --git a/vortex-cuda/kernels/src/zigzag.cu b/vortex-cuda/kernels/src/zigzag.cu index 708e20a1be7..4059452da56 100644 --- a/vortex-cuda/kernels/src/zigzag.cu +++ b/vortex-cuda/kernels/src/zigzag.cu @@ -4,10 +4,11 @@ #include "scalar_kernel.cuh" // ZigZag decode operation. +// // Converts unsigned integers back to signed using the ZigZag encoding scheme. // Formula: decoded = (encoded >> 1) ^ -(encoded & 1) // This interleaves positive and negative numbers: 0, -1, 1, -2, 2, -3, ... -template +template struct ZigZagOp { __device__ inline SignedT operator()(UnsignedT value) const { // ZigZag decode: (n >> 1) ^ -(n & 1) @@ -18,14 +19,13 @@ struct ZigZagOp { // Macro to generate ZigZag kernel for each type. // In-place operation: unsigned input, signed output (same size, reinterpret). -#define GENERATE_ZIGZAG_KERNEL(suffix, UnsignedType, SignedType) \ -extern "C" __global__ void zigzag_##suffix( \ - UnsignedType *__restrict values, \ - uint64_t array_len \ -) { \ - scalar_kernel(values, reinterpret_cast(values), array_len, \ - ZigZagOp{}); \ -} +#define GENERATE_ZIGZAG_KERNEL(suffix, UnsignedType, SignedType) \ + extern "C" __global__ void zigzag_##suffix(UnsignedType *__restrict values, uint64_t array_len) { \ + scalar_kernel(values, \ + reinterpret_cast(values), \ + array_len, \ + ZigZagOp {}); \ + } GENERATE_ZIGZAG_KERNEL(u8, uint8_t, int8_t) GENERATE_ZIGZAG_KERNEL(u16, uint16_t, int16_t)
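Usage sketch (editor's note, not part of the patch): the reformatting leaves the extern "C" surface of the CUB filter wrappers unchanged, so host code still queries the temporary-storage size first and then runs the flagged select. Below is a minimal, hypothetical driver for the u32 byte-mask path; the buffer names, the all-ones mask, the use of the default stream, and the omission of error checking are illustrative assumptions only.

// Minimal host-side sketch showing how the extern "C" entry points generated by
// DEFINE_TEMP_SIZE / DEFINE_FILTER_BYTEMASK are driven. Illustrative only.
#include <cuda_runtime.h>

#include <cstdint>
#include <cstdio>

extern "C" cudaError_t filter_temp_size_u32(size_t *temp_bytes, int64_t num_items);
extern "C" cudaError_t filter_bytemask_u32(void *d_temp,
                                           size_t temp_bytes,
                                           const uint32_t *d_in,
                                           const uint8_t *d_flags,
                                           uint32_t *d_out,
                                           int64_t *d_num_selected,
                                           int64_t num_items,
                                           cudaStream_t stream);

int main() {
    const int64_t n = 1 << 20;

    uint32_t *d_in = nullptr, *d_out = nullptr;
    uint8_t *d_flags = nullptr;
    int64_t *d_count = nullptr;
    cudaMalloc(&d_in, n * sizeof(uint32_t));
    cudaMalloc(&d_out, n * sizeof(uint32_t));
    cudaMalloc(&d_flags, n);  // byte mask: one flag byte per element
    cudaMalloc(&d_count, sizeof(int64_t));
    cudaMemset(d_in, 0, n * sizeof(uint32_t));
    cudaMemset(d_flags, 1, n);  // select every element for this sketch

    // 1. Query CUB's temporary-storage requirement for n items.
    size_t temp_bytes = 0;
    filter_temp_size_u32(&temp_bytes, n);
    void *d_temp = nullptr;
    cudaMalloc(&d_temp, temp_bytes);

    // 2. Run DeviceSelect::Flagged through the C wrapper on the default stream.
    filter_bytemask_u32(d_temp, temp_bytes, d_in, d_flags, d_out, d_count, n, nullptr);

    // 3. Read back how many elements survived the filter.
    int64_t selected = 0;
    cudaMemcpy(&selected, d_count, sizeof(selected), cudaMemcpyDeviceToHost);
    std::printf("selected %lld of %lld rows\n", (long long)selected, (long long)n);

    cudaFree(d_temp);
    cudaFree(d_count);
    cudaFree(d_flags);
    cudaFree(d_out);
    cudaFree(d_in);
    return 0;
}

The packed-bitmask variants (filter_bitmask_*) follow the same call pattern, taking a (d_bitmask, bit_offset) pair in place of the byte flags, so the mask bits are read on the fly and no separate bit-to-byte expansion kernel is needed.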