Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 97 additions & 39 deletions assembly/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,67 +19,125 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.*/

/**
 * Basic scalar addition.
 *
 * @param a - first addend
 * @param b - second addend
 * @returns the 32-bit integer sum of `a` and `b`
 */
export function add(a: i32, b: i32): i32 {
  const sum: i32 = a + b;
  return sum;
}

/**
* Standard scalar array addition using direct pointer arithmetic.
* Performs: A[i] = A[i] + B[i]
*/
export function addF32Arrays(aPtr: usize, bPtr: usize, length: u32): void {
// now handle any remaining elements
for (let i: u32 = 0 /*length & ~7*/; i < length; i++) {
const aOffset = aPtr + (i << 2); // i * 4 (each Float32 is 4 bytes)
const bOffset = bPtr + (i << 2);
const endPtr = aPtr + (<usize>length << 2); // end = start + (length * 4 bytes)

while (aPtr < endPtr) {
// Load values
const va = load<f32>(aPtr);
const vb = load<f32>(bPtr);

const va = f32.load(aOffset);
const vb = f32.load(bOffset);
// Store result
store<f32>(aPtr, va + vb);

f32.store(aOffset, va + vb);
// Advance pointers by 4 bytes (size of f32)
aPtr += 4;
bPtr += 4;
}
}

/**
* SIMD optimized array addition.
* Processes 4 floats (128 bits) per instruction.
*/
export function addF32ArraysSimd4(aPtr: usize, bPtr: usize, length: u32): void {
const stride = 4; // Process 4 f32 elements per SIMD operation
let i: u32 = 0;
const end = length - stride;
// Ensure we have at least 4 elements to process
if (length >= 4) {
// Calculate the end boundary for the vector loop
// We floor the length to the nearest multiple of 4
const vectorLoopEnd = length & ~3;
const endPtr = aPtr + (<usize>vectorLoopEnd << 2);

// Handle SIMD operations for chunks of 4 elements
for (; i < end; i += stride) {
v128.store(aPtr, f32x4.add(v128.load(aPtr), v128.load(bPtr)));
aPtr += stride * sizeof<f32>();
bPtr += stride * sizeof<f32>();
while (aPtr < endPtr) {
// Load 128-bit vectors (4 floats each)
const va = v128.load(aPtr);
const vb = v128.load(bPtr);

// Perform vector addition
const res = f32x4.add(va, vb);

// Store result back to A
v128.store(aPtr, res);

// Advance pointers by 16 bytes (size of v128)
aPtr += 16;
bPtr += 16;
}

// Update the remaining length for the scalar fallback
length -= vectorLoopEnd;
}

// Handle remaining elements that aren't divisible by 4
for (; i < length; i++) {
// Handle remaining elements (0-3 elements)
//
while (length > 0) {
store<f32>(aPtr, load<f32>(aPtr) + load<f32>(bPtr));
aPtr += sizeof<f32>();
bPtr += sizeof<f32>();
aPtr += 4;
bPtr += 4;
length--;
}
}

/**
* Unrolled SIMD array addition.
* Processes 8 floats per loop iteration (2 vectors) to reduce loop overhead.
*/
export function addF32ArraysSimd4Unrolled(aPtr: usize, bPtr: usize, length: u32): void {
const stride = 4; // Process 4 f32 elements per SIMD operation
const unrollFactor = 2; // Number of SIMD operations per loop iteration
const totalStride = stride * unrollFactor; // Total elements processed per loop iteration
let i: u32 = 0;
const end = length - (length % totalStride);

// Handle SIMD operations for chunks of totalStride elements
for (; i < end; i += totalStride) {
// First SIMD operation
v128.store(aPtr, f32x4.add(v128.load(aPtr), v128.load(bPtr)));
aPtr += stride * sizeof<f32>();
bPtr += stride * sizeof<f32>();
// We need at least 8 elements to enter the unrolled loop
if (length >= 8) {
const unrollMask = ~7; // Multiple of 8
const vectorLoopLen = length & unrollMask;
const endPtr = aPtr + (<usize>vectorLoopLen << 2);

while (aPtr < endPtr) {
// Load 2 vectors from A and 2 vectors from B
// Pipelining loads often helps hide memory latency
const va1 = v128.load(aPtr);
const vb1 = v128.load(bPtr);
const va2 = v128.load(aPtr, 16); // Load with offset 16
const vb2 = v128.load(bPtr, 16);

// Perform additions
const res1 = f32x4.add(va1, vb1);
const res2 = f32x4.add(va2, vb2);

// Store results
v128.store(aPtr, res1);
v128.store(aPtr, res2, 16); // Store with offset 16

// Second SIMD operation
// Advance pointers by 32 bytes (8 floats * 4 bytes)
aPtr += 32;
bPtr += 32;
}

length -= vectorLoopLen;
}

// Fallback: Try single SIMD block (4 elements) if enough remain
if (length >= 4) {
v128.store(aPtr, f32x4.add(v128.load(aPtr), v128.load(bPtr)));
aPtr += stride * sizeof<f32>();
bPtr += stride * sizeof<f32>();
aPtr += 16;
bPtr += 16;
length -= 4;
}

// Handle remaining elements that aren't divisible by totalStride
for (; i < length; i++) {
// Fallback: Handle final remaining scalars (0-3 elements)
while (length > 0) {
store<f32>(aPtr, load<f32>(aPtr) + load<f32>(bPtr));
aPtr += sizeof<f32>();
bPtr += sizeof<f32>();
aPtr += 4;
bPtr += 4;
length--;
}
}
}
17 changes: 9 additions & 8 deletions assembly/tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
{
  "extends": "assemblyscript/std/assembly.json",
  "compilerOptions": {
    "target": "esnext",
    "module": "esnext",
    "moduleResolution": "node",
    "noEmit": true,
    "strict": true,
    "noImplicitAny": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "experimentalDecorators": true,
    "types": []
  },
  "include": [
    "./**/*.ts"
  ]
}
136 changes: 78 additions & 58 deletions c/vector_addition.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
* ADOBE CONFIDENTIAL
* ___________________
*
* Copyright 2025 Adobe
* All Rights Reserved.
* Copyright 2025 Adobe
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains
* the property of Adobe and its suppliers, if any. The intellectual
Expand All @@ -20,106 +20,126 @@
#include <stdlib.h>
#include <arm_neon.h>
#include <time.h>
#include <pthread.h>

/*
 * Optimization Note:
 * Treating data as Array of Structures (AoS) - {x,y,z}, {x,y,z}...
 * usually incurs a penalty compared to Structure of Arrays (SoA) - {x,x...}, {y,y...}.
 * However, using NEON's `vld3q` / `vst3q` allows us to de-interleave on load
 * and re-interleave on store effectively, mitigating the AoS penalty.
 */

/* Vector3 struct: three packed floats (12 bytes, no padding). */
typedef struct
{
    float x, y, z;
} Vector3;

#define ARRAY_SIZE 250000
#define NUM_RUNS 500 // Increased runs for more stable benchmarking
#define ALIGNMENT 16 // 128-bit alignment for SIMD-friendly allocations

/*
 * High precision timer: returns the current monotonic time in seconds.
 * CLOCK_MONOTONIC_RAW is immune to NTP slewing, so intervals measured
 * with it are stable for benchmarking.
 */
double get_time()
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC_RAW, &now);
    return now.tv_sec + now.tv_nsec * 1e-9;
}

/**
 * Adds two arrays of Vector3 using NEON intrinsics.
 * Uses vld3q/vst3q to handle the interleaved (AoS) data layout.
 *
 * a, b   : input arrays of `size` Vector3 elements (must not alias result)
 * result : output array of `size` Vector3 elements
 * size   : element count; any value >= 0 is handled (scalar tail loop)
 */
void add_vectors_neon(const Vector3 *restrict a, const Vector3 *restrict b, Vector3 *restrict result, int size)
{
    // Process 4 Vector3 values (12 floats) per iteration.
    int i = 0;
    for (; i <= size - 4; i += 4)
    {
        // Load 3-element structures de-interleaved into 3 registers:
        // input: x0 y0 z0 x1 y1 z1 ...
        // va.val[0] = {x0, x1, x2, x3}
        // va.val[1] = {y0, y1, y2, y3}
        // va.val[2] = {z0, z1, z2, z3}
        float32x4x3_t va = vld3q_f32((const float *)&a[i]);
        float32x4x3_t vb = vld3q_f32((const float *)&b[i]);
        float32x4x3_t vres;

        // Vectorized addition, one lane-set per component.
        vres.val[0] = vaddq_f32(va.val[0], vb.val[0]);
        vres.val[1] = vaddq_f32(va.val[1], vb.val[1]);
        vres.val[2] = vaddq_f32(va.val[2], vb.val[2]);

        // Re-interleave and store back to memory.
        vst3q_f32((float *)&result[i], vres);
    }

    // Handle remainder (tail cleanup) if size is not divisible by 4.
    for (; i < size; i++)
    {
        result[i].x = a[i].x + b[i].x;
        result[i].y = a[i].y + b[i].y;
        result[i].z = a[i].z + b[i].z;
    }
}

int main()
{
// Allocate memory for the two arrays and the result array
Vector3 *a = (Vector3 *)aligned_alloc(16, ARRAY_SIZE * sizeof(Vector3));
Vector3 *b = (Vector3 *)aligned_alloc(16, ARRAY_SIZE * sizeof(Vector3));
Vector3 *result = (Vector3 *)aligned_alloc(16, ARRAY_SIZE * sizeof(Vector3));
size_t data_size = ARRAY_SIZE * sizeof(Vector3);

// Using aligned_alloc for SIMD friendliness, though vld3 handles unaligned well on modern ARM
Vector3 *a = (Vector3 *)aligned_alloc(ALIGNMENT, data_size);
Vector3 *b = (Vector3 *)aligned_alloc(ALIGNMENT, data_size);
Vector3 *result = (Vector3 *)aligned_alloc(ALIGNMENT, data_size);

// Initialize arrays with some data
if (!a || !b || !result) {
fprintf(stderr, "Memory allocation failed\n");
return 1;
}

// Initialize data
for (int i = 0; i < ARRAY_SIZE; i++)
{
a[i].x = i * 0.1f;
a[i].y = i * 0.2f;
a[i].z = i * 0.3f;
b[i].x = i * 0.4f;
b[i].y = i * 0.5f;
b[i].z = i * 0.6f;
float f = (float)i;
a[i] = (Vector3){f * 0.1f, f * 0.2f, f * 0.3f};
b[i] = (Vector3){f * 0.4f, f * 0.5f, f * 0.6f};
}

// Use volatile to prevent the compiler from optimizing away the result
// Sink to prevent optimization
volatile float sink = 0.0f;

// Perform the vector addition NUM_RUNS times and measure the time
printf("Benchmarking Vector3 Addition (NEON Optimized)...\n");
printf("Array Size: %d | Runs: %d\n", ARRAY_SIZE, NUM_RUNS);

double start_time = get_time();

for (int run = 0; run < NUM_RUNS; run++)
{
add_vectors(a, b, result, ARRAY_SIZE);
add_vectors_neon(a, b, result, ARRAY_SIZE);

// Force the compiler to keep the result
sink += result[0].x + result[0].y + result[0].z;
// Simple aggregation to prevent dead code elimination
// Only checking the first element to minimize overhead inside the benchmark loop
sink += result[0].x;
}

double end_time = get_time();

// Calculate elapsed time
double total_time = end_time - start_time;
double average_time = total_time / NUM_RUNS;

// Calculate MFLOPS
double total_flops = sizeof(Vector3) * ARRAY_SIZE * NUM_RUNS; // 3 operations per Vector3
double mflops_per_second = (total_flops / total_time) / 1e6;

printf("Average time per run: %f seconds\n", average_time);
printf("Total time: %f seconds\n", total_time);
printf("Performance: %f MFLOPS\n", mflops_per_second);
// FLOP Calculation: 3 additions per Vector3 (x+x, y+y, z+z)
double ops_per_run = 3.0 * ARRAY_SIZE;
double total_flops = ops_per_run * NUM_RUNS;
double mflops = (total_flops / total_time) / 1e6;

// Use sink to prevent dead-code elimination
printf("Sink: %f\n", sink);
printf("------------------------------------------------\n");
printf("Total Time : %f seconds\n", total_time);
printf("Avg Time/Run : %f seconds\n", average_time);
printf("Throughput : %.2f MFLOPS\n", mflops);
printf("Sink Value : %f\n", sink); // Verification

// Free allocated memory
free(a);
free(b);
free(result);

return 0;
}
}
Loading