Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 97 additions & 39 deletions assembly/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,67 +19,125 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.*/

/**
 * Basic scalar addition.
 *
 * @param a - first addend
 * @param b - second addend
 * @returns the 32-bit integer sum of `a` and `b`
 */
export function add(a: i32, b: i32): i32 {
  const sum: i32 = a + b;
  return sum;
}

/**
* Standard scalar array addition using direct pointer arithmetic.
* Performs: A[i] = A[i] + B[i]
*/
export function addF32Arrays(aPtr: usize, bPtr: usize, length: u32): void {
// now handle any remaining elements
for (let i: u32 = 0 /*length & ~7*/; i < length; i++) {
const aOffset = aPtr + (i << 2); // i * 4 (each Float32 is 4 bytes)
const bOffset = bPtr + (i << 2);
const endPtr = aPtr + (<usize>length << 2); // end = start + (length * 4 bytes)

while (aPtr < endPtr) {
// Load values
const va = load<f32>(aPtr);
const vb = load<f32>(bPtr);

const va = f32.load(aOffset);
const vb = f32.load(bOffset);
// Store result
store<f32>(aPtr, va + vb);

f32.store(aOffset, va + vb);
// Advance pointers by 4 bytes (size of f32)
aPtr += 4;
bPtr += 4;
}
}

/**
* SIMD optimized array addition.
* Processes 4 floats (128 bits) per instruction.
*/
export function addF32ArraysSimd4(aPtr: usize, bPtr: usize, length: u32): void {
const stride = 4; // Process 4 f32 elements per SIMD operation
let i: u32 = 0;
const end = length - stride;
// Ensure we have at least 4 elements to process
if (length >= 4) {
// Calculate the end boundary for the vector loop
// We floor the length to the nearest multiple of 4
const vectorLoopEnd = length & ~3;
const endPtr = aPtr + (<usize>vectorLoopEnd << 2);

// Handle SIMD operations for chunks of 4 elements
for (; i < end; i += stride) {
v128.store(aPtr, f32x4.add(v128.load(aPtr), v128.load(bPtr)));
aPtr += stride * sizeof<f32>();
bPtr += stride * sizeof<f32>();
while (aPtr < endPtr) {
// Load 128-bit vectors (4 floats each)
const va = v128.load(aPtr);
const vb = v128.load(bPtr);

// Perform vector addition
const res = f32x4.add(va, vb);

// Store result back to A
v128.store(aPtr, res);

// Advance pointers by 16 bytes (size of v128)
aPtr += 16;
bPtr += 16;
}

// Update the remaining length for the scalar fallback
length -= vectorLoopEnd;
}

// Handle remaining elements that aren't divisible by 4
for (; i < length; i++) {
// Handle remaining elements (0-3 elements)
//
while (length > 0) {
store<f32>(aPtr, load<f32>(aPtr) + load<f32>(bPtr));
aPtr += sizeof<f32>();
bPtr += sizeof<f32>();
aPtr += 4;
bPtr += 4;
length--;
}
}

/**
* Unrolled SIMD array addition.
* Processes 8 floats per loop iteration (2 vectors) to reduce loop overhead.
*/
export function addF32ArraysSimd4Unrolled(aPtr: usize, bPtr: usize, length: u32): void {
const stride = 4; // Process 4 f32 elements per SIMD operation
const unrollFactor = 2; // Number of SIMD operations per loop iteration
const totalStride = stride * unrollFactor; // Total elements processed per loop iteration
let i: u32 = 0;
const end = length - (length % totalStride);

// Handle SIMD operations for chunks of totalStride elements
for (; i < end; i += totalStride) {
// First SIMD operation
v128.store(aPtr, f32x4.add(v128.load(aPtr), v128.load(bPtr)));
aPtr += stride * sizeof<f32>();
bPtr += stride * sizeof<f32>();
// We need at least 8 elements to enter the unrolled loop
if (length >= 8) {
const unrollMask = ~7; // Multiple of 8
const vectorLoopLen = length & unrollMask;
const endPtr = aPtr + (<usize>vectorLoopLen << 2);

while (aPtr < endPtr) {
// Load 2 vectors from A and 2 vectors from B
// Pipelining loads often helps hide memory latency
const va1 = v128.load(aPtr);
const vb1 = v128.load(bPtr);
const va2 = v128.load(aPtr, 16); // Load with offset 16
const vb2 = v128.load(bPtr, 16);

// Perform additions
const res1 = f32x4.add(va1, vb1);
const res2 = f32x4.add(va2, vb2);

// Store results
v128.store(aPtr, res1);
v128.store(aPtr, res2, 16); // Store with offset 16

// Second SIMD operation
// Advance pointers by 32 bytes (8 floats * 4 bytes)
aPtr += 32;
bPtr += 32;
}

length -= vectorLoopLen;
}

// Fallback: Try single SIMD block (4 elements) if enough remain
if (length >= 4) {
v128.store(aPtr, f32x4.add(v128.load(aPtr), v128.load(bPtr)));
aPtr += stride * sizeof<f32>();
bPtr += stride * sizeof<f32>();
aPtr += 16;
bPtr += 16;
length -= 4;
}

// Handle remaining elements that aren't divisible by totalStride
for (; i < length; i++) {
// Fallback: Handle final remaining scalars (0-3 elements)
while (length > 0) {
store<f32>(aPtr, load<f32>(aPtr) + load<f32>(bPtr));
aPtr += sizeof<f32>();
bPtr += sizeof<f32>();
aPtr += 4;
bPtr += 4;
length--;
}
}
}
17 changes: 9 additions & 8 deletions assembly/tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
{
  "extends": "assemblyscript/std/assembly.json",
  "compilerOptions": {
    "target": "esnext",
    "module": "esnext",
    "moduleResolution": "node",
    "noEmit": true,
    "strict": true,
    "noImplicitAny": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "experimentalDecorators": true,
    "types": []
  },
  "include": [
    "./**/*.ts"
  ]
}
136 changes: 78 additions & 58 deletions c/vector_addition.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
* ADOBE CONFIDENTIAL
* ___________________
*
* Copyright 2025 Adobe
* All Rights Reserved.
* Copyright 2025 Adobe
* All Rights Reserved.
*
* NOTICE: All information contained herein is, and remains
* the property of Adobe and its suppliers, if any. The intellectual
Expand All @@ -20,106 +20,126 @@
#include <stdlib.h>
#include <arm_neon.h>
#include <time.h>
#include <pthread.h>

/*
 * Optimization Note:
 * Treating data as Array of Structures (AoS) - {x,y,z}, {x,y,z}...
 * usually incurs a penalty compared to Structure of Arrays (SoA) - {x,x...}, {y,y...}.
 * However, using NEON's `vld3q` / `vst3q` allows us to de-interleave on load
 * and re-interleave on store effectively, mitigating the AoS penalty.
 */

/* Vector3 struct: three packed floats (12 bytes, no padding). */
typedef struct
{
    float x, y, z;
} Vector3;

#define ARRAY_SIZE 250000
#define NUM_RUNS 500 // Increased runs for more stable benchmarking
#define ALIGNMENT 16 // 128-bit alignment for SIMD-friendly allocations

/*
 * High precision timer: returns the current monotonic time in seconds.
 * CLOCK_MONOTONIC_RAW is immune to NTP slewing, so intervals measured
 * with it are stable for benchmarking.
 */
double get_time()
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC_RAW, &now);
    return now.tv_sec + now.tv_nsec * 1e-9;
}

/**
 * Adds two arrays of Vector3 using NEON intrinsics.
 * Uses vld3q/vst3q to handle the interleaved (AoS) data layout.
 *
 * a, b   : input arrays of `size` Vector3 elements (must not alias result)
 * result : output array of `size` Vector3 elements
 * size   : element count; any value >= 0 is handled (scalar tail loop)
 */
void add_vectors_neon(const Vector3 *restrict a, const Vector3 *restrict b, Vector3 *restrict result, int size)
{
    // Process 4 Vector3 values (12 floats) per iteration.
    int i = 0;
    for (; i <= size - 4; i += 4)
    {
        // Load 3-element structures de-interleaved into 3 registers:
        // input: x0 y0 z0 x1 y1 z1 ...
        // va.val[0] = {x0, x1, x2, x3}
        // va.val[1] = {y0, y1, y2, y3}
        // va.val[2] = {z0, z1, z2, z3}
        float32x4x3_t va = vld3q_f32((const float *)&a[i]);
        float32x4x3_t vb = vld3q_f32((const float *)&b[i]);
        float32x4x3_t vres;

        // Vectorized addition, one lane-set per component.
        vres.val[0] = vaddq_f32(va.val[0], vb.val[0]);
        vres.val[1] = vaddq_f32(va.val[1], vb.val[1]);
        vres.val[2] = vaddq_f32(va.val[2], vb.val[2]);

        // Re-interleave and store back to memory.
        vst3q_f32((float *)&result[i], vres);
    }

    // Handle remainder (tail cleanup) if size is not divisible by 4.
    for (; i < size; i++)
    {
        result[i].x = a[i].x + b[i].x;
        result[i].y = a[i].y + b[i].y;
        result[i].z = a[i].z + b[i].z;
    }
}

int main()
{
// Allocate memory for the two arrays and the result array
Vector3 *a = (Vector3 *)aligned_alloc(16, ARRAY_SIZE * sizeof(Vector3));
Vector3 *b = (Vector3 *)aligned_alloc(16, ARRAY_SIZE * sizeof(Vector3));
Vector3 *result = (Vector3 *)aligned_alloc(16, ARRAY_SIZE * sizeof(Vector3));
size_t data_size = ARRAY_SIZE * sizeof(Vector3);

// Using aligned_alloc for SIMD friendliness, though vld3 handles unaligned well on modern ARM
Vector3 *a = (Vector3 *)aligned_alloc(ALIGNMENT, data_size);
Vector3 *b = (Vector3 *)aligned_alloc(ALIGNMENT, data_size);
Vector3 *result = (Vector3 *)aligned_alloc(ALIGNMENT, data_size);

// Initialize arrays with some data
if (!a || !b || !result) {
fprintf(stderr, "Memory allocation failed\n");
return 1;
}

// Initialize data
for (int i = 0; i < ARRAY_SIZE; i++)
{
a[i].x = i * 0.1f;
a[i].y = i * 0.2f;
a[i].z = i * 0.3f;
b[i].x = i * 0.4f;
b[i].y = i * 0.5f;
b[i].z = i * 0.6f;
float f = (float)i;
a[i] = (Vector3){f * 0.1f, f * 0.2f, f * 0.3f};
b[i] = (Vector3){f * 0.4f, f * 0.5f, f * 0.6f};
}

// Use volatile to prevent the compiler from optimizing away the result
// Sink to prevent optimization
volatile float sink = 0.0f;

// Perform the vector addition NUM_RUNS times and measure the time
printf("Benchmarking Vector3 Addition (NEON Optimized)...\n");
printf("Array Size: %d | Runs: %d\n", ARRAY_SIZE, NUM_RUNS);

double start_time = get_time();

for (int run = 0; run < NUM_RUNS; run++)
{
add_vectors(a, b, result, ARRAY_SIZE);
add_vectors_neon(a, b, result, ARRAY_SIZE);

// Force the compiler to keep the result
sink += result[0].x + result[0].y + result[0].z;
// Simple aggregation to prevent dead code elimination
// Only checking the first element to minimize overhead inside the benchmark loop
sink += result[0].x;
}

double end_time = get_time();

// Calculate elapsed time
double total_time = end_time - start_time;
double average_time = total_time / NUM_RUNS;

// Calculate MFLOPS
double total_flops = sizeof(Vector3) * ARRAY_SIZE * NUM_RUNS; // 3 operations per Vector3
double mflops_per_second = (total_flops / total_time) / 1e6;

printf("Average time per run: %f seconds\n", average_time);
printf("Total time: %f seconds\n", total_time);
printf("Performance: %f MFLOPS\n", mflops_per_second);
// FLOP Calculation: 3 additions per Vector3 (x+x, y+y, z+z)
double ops_per_run = 3.0 * ARRAY_SIZE;
double total_flops = ops_per_run * NUM_RUNS;
double mflops = (total_flops / total_time) / 1e6;

// Use sink to prevent dead-code elimination
printf("Sink: %f\n", sink);
printf("------------------------------------------------\n");
printf("Total Time : %f seconds\n", total_time);
printf("Avg Time/Run : %f seconds\n", average_time);
printf("Throughput : %.2f MFLOPS\n", mflops);
printf("Sink Value : %f\n", sink); // Verification

// Free allocated memory
free(a);
free(b);
free(result);

return 0;
}
}
Loading