From 2afe7299cc47f2c33b34288db465339f20f1e16f Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Fri, 5 Jun 2026 12:10:11 +0000
Subject: [PATCH 1/2] adding F-contig layout support + other cleanups

---
 src/csrc/quadblas_interface.cpp |  11 +-
 src/csrc/umath/matmul.cpp       | 396 ++++++++++++++++++++------------
 tests/test_dot.py               | 138 +++++++++--
 3 files changed, 376 insertions(+), 169 deletions(-)
diff --git a/src/csrc/quadblas_interface.cpp b/src/csrc/quadblas_interface.cpp
index f73ba9c..efdf5b8 100644
--- a/src/csrc/quadblas_interface.cpp
+++ b/src/csrc/quadblas_interface.cpp
@@ -46,6 +46,15 @@ qblas_gemv(char layout, char trans, size_t m, size_t n,
            Sleef_quad *beta, Sleef_quad *y, size_t incy)
 {
     if (m == 0 || n == 0) {
+        if (y && beta) {
+            int do_trans = (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c');
+            size_t y_len = do_trans ? n : m;
+            Sleef_quad zero = Sleef_cast_from_doubleq1(0.0);
+            int beta_zero = Sleef_icmpeqq1(*beta, zero);
+            for (size_t i = 0; i < y_len; i++) {
+                y[i * incy] = beta_zero ? zero : Sleef_mulq1_u05(*beta, y[i * incy]);
+            }
+        }
         return 0;
     }
     if (!alpha || !A || !x || !beta || !y) {
@@ -66,7 +75,7 @@ qblas_gemm(char layout, char transa, char transb,
            Sleef_quad *B, size_t ldb,
            Sleef_quad *beta, Sleef_quad *C, size_t ldc)
 {
-    if (m == 0 || n == 0 || k == 0) {
+    if (m == 0 || n == 0) {
         return 0;
     }
     if (!alpha || !A || !B || !beta || !C) {
diff --git a/src/csrc/umath/matmul.cpp b/src/csrc/umath/matmul.cpp
index 165a2c8..c04fe06 100644
--- a/src/csrc/umath/matmul.cpp
+++ b/src/csrc/umath/matmul.cpp
@@ -16,6 +16,8 @@ extern "C" {
 #include "numpy/dtype_api.h"
 }
 
+#include <climits>
+
 #include "quad_common.h"
 #include "scalar.h"
 #include "dtype.h"
@@ -93,30 +95,89 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p)
     }
 }
 
+static int
+naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                          npy_intp const dimensions[], npy_intp const strides[],
+                          NpyAuxData *auxdata);
+
+static inline int
+matmul_stride_ok(npy_intp s)
+{
+    npy_intp elsize = (npy_intp)sizeof(Sleef_quad);
+    return s > 0 && (s % elsize == 0) && (s / elsize) <= INT_MAX;
+}
+
+static inline int
+is_blasable2d(npy_intp s_slow, npy_intp s_fast, npy_intp d_slow, npy_intp d_fast)
+{
+    npy_intp elsize = (npy_intp)sizeof(Sleef_quad);
+    if (s_fast != elsize) {
+        return 0;
+    }
+    npy_intp unit_slow = s_slow / elsize;
+    return (s_slow % elsize == 0) && (unit_slow >= d_fast) && (unit_slow <= INT_MAX);
+}
+
+static int
+matmul_alloc_count(npy_intp a, npy_intp b, size_t *out)
+{
+    if (a < 0 || b < 0) {
+        return -1;
+    }
+    if (a != 0 && b > NPY_MAX_INTP / a) {
+        return -1;
+    }
+    npy_intp count = a * b;
+    if ((size_t)count > ((size_t)-1) / sizeof(Sleef_quad)) {
+        return -1;
+    }
+    *out = (size_t)count;
+    return 0;
+}
+
+static void
+matmul_gather(Sleef_quad *dst, const char *src, npy_intp rows, npy_intp cols,
+              npy_intp row_stride, npy_intp col_stride)
+{
+    for (npy_intp i = 0; i < rows; i++) {
+        const char *row = src + i * row_stride;
+        for (npy_intp j = 0; j < cols; j++) {
+            memcpy(&dst[i * cols + j], row + j * col_stride, sizeof(Sleef_quad));
+        }
+    }
+}
+
+static void
+matmul_scatter(char *dst, const Sleef_quad *src, npy_intp rows, npy_intp cols,
+               npy_intp row_stride, npy_intp col_stride)
+{
+    for (npy_intp i = 0; i < rows; i++) {
+        char *row = dst + i * row_stride;
+        for (npy_intp j = 0; j < cols; j++) {
+            memcpy(row + j * col_stride, &src[i * cols + j], sizeof(Sleef_quad));
+        }
+    }
+}
+
 static int
 quad_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
                                  npy_intp const dimensions[], npy_intp const strides[],
                                  NpyAuxData *auxdata)
 {
-    // Outer (broadcast / batch) loop length and core matrix dims.
     npy_intp N = dimensions[0];
     npy_intp m = dimensions[1];
     npy_intp n = dimensions[2];
     npy_intp p = dimensions[3];
 
-    // Strides between batches for each operand.
     npy_intp A_stride = strides[0];
     npy_intp B_stride = strides[1];
     npy_intp C_stride = strides[2];
-
-    // Strides within a single core matrix.
     npy_intp A_row_stride = strides[3];
     npy_intp A_col_stride = strides[4];
     npy_intp B_row_stride = strides[5];
-    // B_col_stride / C_col_stride are unused in this branch (output is
-    // assumed row-major contiguous within each batch, matching the QBLAS
-    // call shape).
+    npy_intp B_col_stride = strides[6];
     npy_intp C_row_stride = strides[7];
+    npy_intp C_col_stride = strides[8];
 
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     if (descr->backend != BACKEND_SLEEF) {
@@ -126,192 +187,223 @@ quad_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const dat
         return -1;
     }
 
+    if (m > INT_MAX || n > INT_MAX || p > INT_MAX || m == 0 || n == 0 || p == 0) {
+        return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata);
+    }
+
     MatmulOperationType op_type = determine_operation_type(m, n, p);
     Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
     Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
+    const npy_intp elsize = (npy_intp)sizeof(Sleef_quad);
 
     char *A_base = data[0];
     char *B_base = data[1];
     char *C_base = data[2];
 
-    for (npy_intp batch = 0; batch < N; batch++) {
-        Sleef_quad *A_ptr = (Sleef_quad *)(A_base + batch * A_stride);
-        Sleef_quad *B_ptr = (Sleef_quad *)(B_base + batch * B_stride);
-        Sleef_quad *C_ptr = (Sleef_quad *)(C_base + batch * C_stride);
-
-        int result = -1;
-
-        switch (op_type) {
-            case MATMUL_DOT: {
-                size_t incx = A_col_stride / sizeof(Sleef_quad);
-                size_t incy = B_row_stride / sizeof(Sleef_quad);
-
-                result = qblas_dot(n, A_ptr, incx, B_ptr, incy, C_ptr);
-                break;
+    if (op_type == MATMUL_DOT) {
+        if (!matmul_stride_ok(A_col_stride) || !matmul_stride_ok(B_row_stride)) {
+            return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata);
+        }
+        size_t incx = (size_t)(A_col_stride / elsize);
+        size_t incy = (size_t)(B_row_stride / elsize);
+        for (npy_intp batch = 0; batch < N; batch++) {
+            Sleef_quad *A_ptr = (Sleef_quad *)(A_base + batch * A_stride);
+            Sleef_quad *B_ptr = (Sleef_quad *)(B_base + batch * B_stride);
+            Sleef_quad *C_ptr = (Sleef_quad *)(C_base + batch * C_stride);
+            if (qblas_dot((size_t)n, A_ptr, incx, B_ptr, incy, C_ptr) != 0) {
+                PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed");
+                return -1;
             }
+        }
+        return 0;
+    }
 
-            case MATMUL_GEMV: {
-                size_t lda = A_row_stride / sizeof(Sleef_quad);
-                size_t incx = B_row_stride / sizeof(Sleef_quad);
-                size_t incy = C_row_stride / sizeof(Sleef_quad);
-
-                memset(C_ptr, 0, m * p * sizeof(Sleef_quad));
+    if (op_type == MATMUL_GEMV) {
+        char transa;
+        size_t lda, gemv_rows, gemv_cols;
+        int A_copy = 0;
+        if (is_blasable2d(A_row_stride, A_col_stride, m, n)) {
+            transa = 'N';
+            gemv_rows = (size_t)m;
+            gemv_cols = (size_t)n;
+            lda = (size_t)(A_row_stride / elsize);
+        }
+        else if (is_blasable2d(A_col_stride, A_row_stride, n, m)) {
+            transa = 'T';
+            gemv_rows = (size_t)n;
+            gemv_cols = (size_t)m;
+            lda = (size_t)(A_col_stride / elsize);
+        }
+        else {
+            transa = 'N';
+            gemv_rows = (size_t)m;
+            gemv_cols = (size_t)n;
+            lda = (size_t)n;
+            A_copy = 1;
+        }
 
-                result = qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta,
-                                    C_ptr, incy);
-                break;
-            }
+        int x_copy = !matmul_stride_ok(B_row_stride);
+        int y_copy = !matmul_stride_ok(C_row_stride);
+        size_t incx = x_copy ? 1 : (size_t)(B_row_stride / elsize);
+        size_t incy = y_copy ? 1 : (size_t)(C_row_stride / elsize);
 
-            case MATMUL_GEMM: {
-                size_t lda = A_row_stride / sizeof(Sleef_quad);
-                size_t ldb = B_row_stride / sizeof(Sleef_quad);
-                size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad);
+        size_t cntA = 0;
+        if (A_copy && matmul_alloc_count(m, n, &cntA) != 0) {
+            return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata);
+        }
 
-                memset(C_ptr, 0, m * p * sizeof(Sleef_quad));
+        Sleef_quad *tA = NULL, *tx = NULL, *ty = NULL;
+        if (A_copy) tA = (Sleef_quad *)PyMem_RawMalloc(cntA * sizeof(Sleef_quad));
+        if (x_copy) tx = (Sleef_quad *)PyMem_RawMalloc((size_t)n * sizeof(Sleef_quad));
+        if (y_copy) ty = (Sleef_quad *)PyMem_RawMalloc((size_t)m * sizeof(Sleef_quad));
+        if ((A_copy && !tA) || (x_copy && !tx) || (y_copy && !ty)) {
+            PyMem_RawFree(tA);
+            PyMem_RawFree(tx);
+            PyMem_RawFree(ty);
+            PyErr_NoMemory();
+            return -1;
+        }
 
-                result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb,
-                                    &beta, C_ptr, ldc_numpy);
+        int result = 0;
+        for (npy_intp batch = 0; batch < N; batch++) {
+            char *A_b = A_base + batch * A_stride;
+            char *B_b = B_base + batch * B_stride;
+            char *C_b = C_base + batch * C_stride;
+            Sleef_quad *A_use = (Sleef_quad *)A_b;
+            Sleef_quad *x_use = (Sleef_quad *)B_b;
+            Sleef_quad *y_use = (Sleef_quad *)C_b;
+            if (A_copy) {
+                matmul_gather(tA, A_b, m, n, A_row_stride, A_col_stride);
+                A_use = tA;
+            }
+            if (x_copy) {
+                matmul_gather(tx, B_b, n, 1, B_row_stride, elsize);
+                x_use = tx;
+            }
+            if (y_copy) {
+                y_use = ty;
+            }
+            result = qblas_gemv('R', transa, gemv_rows, gemv_cols, &alpha, A_use, lda,
+                                x_use, incx, &beta, y_use, incy);
+            if (result == 0 && y_copy) {
+                matmul_scatter(C_b, ty, m, 1, C_row_stride, elsize);
+            }
+            if (result != 0) {
                 break;
             }
-
-            default:
-                PyErr_SetString(PyExc_RuntimeError,
-                                "internal error: unknown MatmulOperationType");
-                return -1;
         }
-
+        PyMem_RawFree(tA);
+        PyMem_RawFree(tx);
+        PyMem_RawFree(ty);
         if (result != 0) {
             PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed");
             return -1;
         }
+        return 0;
     }
 
-    return 0;
-}
-
-static int
-quad_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                   npy_intp const dimensions[], npy_intp const strides[],
-                                   NpyAuxData *auxdata)
-{
-    // Outer (broadcast / batch) loop length and core matrix dims.
-    npy_intp N = dimensions[0];
-    npy_intp m = dimensions[1];
-    npy_intp n = dimensions[2];
-    npy_intp p = dimensions[3];
-
-    // Strides between batches for each operand.
-    npy_intp A_stride = strides[0];
-    npy_intp B_stride = strides[1];
-    npy_intp C_stride = strides[2];
-
-    // Core strides aren't needed here: this branch copies each batch into
-    // a contiguous temp buffer before passing it to QBLAS.
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    if (descr->backend != BACKEND_SLEEF) {
-        PyErr_SetString(PyExc_NotImplementedError,
-                        "QBLAS-accelerated matmul only supports SLEEF backend. "
-                        "Please raise the issue at SwayamInSync/QBLAS for longdouble support");
-        return -1;
+    char transa, transb;
+    size_t lda, ldb, ldc;
+    int A_copy = 0, B_copy = 0, C_copy = 0;
+    if (is_blasable2d(A_row_stride, A_col_stride, m, n)) {
+        transa = 'N';
+        lda = (size_t)(A_row_stride / elsize);
+    }
+    else if (is_blasable2d(A_col_stride, A_row_stride, n, m)) {
+        transa = 'T';
+        lda = (size_t)(A_col_stride / elsize);
+    }
+    else {
+        transa = 'N';
+        lda = (size_t)n;
+        A_copy = 1;
+    }
+    if (is_blasable2d(B_row_stride, B_col_stride, n, p)) {
+        transb = 'N';
+        ldb = (size_t)(B_row_stride / elsize);
+    }
+    else if (is_blasable2d(B_col_stride, B_row_stride, p, n)) {
+        transb = 'T';
+        ldb = (size_t)(B_col_stride / elsize);
+    }
+    else {
+        transb = 'N';
+        ldb = (size_t)p;
+        B_copy = 1;
+    }
+    if (is_blasable2d(C_row_stride, C_col_stride, m, p)) {
+        ldc = (size_t)(C_row_stride / elsize);
+    }
+    else {
+        ldc = (size_t)p;
+        C_copy = 1;
     }
 
-    MatmulOperationType op_type = determine_operation_type(m, n, p);
-    Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0);
-    Sleef_quad beta = Sleef_cast_from_doubleq1(0.0);
-
-    char *A_base = data[0];
-    char *B_base = data[1];
-    char *C_base = data[2];
-
-    // Allocate the contiguous temp buffers once, reuse them across batches.
-    Sleef_quad *temp_A_buffer = nullptr;
-    Sleef_quad *temp_B_buffer = nullptr;
-    Sleef_quad *temp_C_buffer = nullptr;
+    size_t cntA = 0, cntB = 0, cntC = 0;
+    if ((A_copy && matmul_alloc_count(m, n, &cntA) != 0) ||
+        (B_copy && matmul_alloc_count(n, p, &cntB) != 0) ||
+        (C_copy && matmul_alloc_count(m, p, &cntC) != 0)) {
+        return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata);
+    }
 
-    switch (op_type) {
-        case MATMUL_DOT:
-            temp_A_buffer = new Sleef_quad[n];
-            temp_B_buffer = new Sleef_quad[n];
-            break;
-        case MATMUL_GEMV:
-        case MATMUL_GEMM:
-            temp_A_buffer = new Sleef_quad[m * n];
-            temp_B_buffer = new Sleef_quad[n * p];
-            temp_C_buffer = new Sleef_quad[m * p];
-            break;
-        default:
-            PyErr_SetString(PyExc_RuntimeError,
-                            "internal error: unknown MatmulOperationType");
-            return -1;
+    Sleef_quad *tA = NULL, *tB = NULL, *tC = NULL;
+    if (A_copy) tA = (Sleef_quad *)PyMem_RawMalloc(cntA * sizeof(Sleef_quad));
+    if (B_copy) tB = (Sleef_quad *)PyMem_RawMalloc(cntB * sizeof(Sleef_quad));
+    if (C_copy) tC = (Sleef_quad *)PyMem_RawMalloc(cntC * sizeof(Sleef_quad));
+    if ((A_copy && !tA) || (B_copy && !tB) || (C_copy && !tC)) {
+        PyMem_RawFree(tA);
+        PyMem_RawFree(tB);
+        PyMem_RawFree(tC);
+        PyErr_NoMemory();
+        return -1;
     }
 
     int result = 0;
     for (npy_intp batch = 0; batch < N; batch++) {
-        Sleef_quad *A_batch = (Sleef_quad *)(A_base + batch * A_stride);
-        Sleef_quad *B_batch = (Sleef_quad *)(B_base + batch * B_stride);
-        Sleef_quad *C_batch = (Sleef_quad *)(C_base + batch * C_stride);
-
-        switch (op_type) {
-            case MATMUL_DOT: {
-                memcpy(temp_A_buffer, A_batch, n * sizeof(Sleef_quad));
-                memcpy(temp_B_buffer, B_batch, n * sizeof(Sleef_quad));
-                result = qblas_dot(n, temp_A_buffer, 1, temp_B_buffer, 1, C_batch);
-                break;
-            }
-
-            case MATMUL_GEMV: {
-                memcpy(temp_A_buffer, A_batch, m * n * sizeof(Sleef_quad));
-                memcpy(temp_B_buffer, B_batch, n * p * sizeof(Sleef_quad));
-                memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad));
-
-                result = qblas_gemv('R', 'N', m, n, &alpha, temp_A_buffer, n,
-                                    temp_B_buffer, 1, &beta, temp_C_buffer, 1);
-                if (result == 0) {
-                    memcpy(C_batch, temp_C_buffer, m * p * sizeof(Sleef_quad));
-                }
-                break;
-            }
-
-            case MATMUL_GEMM: {
-                memcpy(temp_A_buffer, A_batch, m * n * sizeof(Sleef_quad));
-                memcpy(temp_B_buffer, B_batch, n * p * sizeof(Sleef_quad));
-                memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad));
-
-                result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, temp_A_buffer, n,
-                                    temp_B_buffer, p, &beta, temp_C_buffer, p);
-                if (result == 0) {
-                    memcpy(C_batch, temp_C_buffer, m * p * sizeof(Sleef_quad));
-                }
-                break;
-            }
-
-            default:
-                PyErr_SetString(PyExc_RuntimeError,
-                                "internal error: unknown MatmulOperationType");
-                result = -1;
-                break;
+        char *A_b = A_base + batch * A_stride;
+        char *B_b = B_base + batch * B_stride;
+        char *C_b = C_base + batch * C_stride;
+        Sleef_quad *A_use = (Sleef_quad *)A_b;
+        Sleef_quad *B_use = (Sleef_quad *)B_b;
+        Sleef_quad *C_use = (Sleef_quad *)C_b;
+        if (A_copy) {
+            matmul_gather(tA, A_b, m, n, A_row_stride, A_col_stride);
+            A_use = tA;
+        }
+        if (B_copy) {
+            matmul_gather(tB, B_b, n, p, B_row_stride, B_col_stride);
+            B_use = tB;
+        }
+        if (C_copy) {
+            C_use = tC;
+        }
+        result = qblas_gemm('R', transa, transb, (size_t)m, (size_t)p, (size_t)n,
+                            &alpha, A_use, lda, B_use, ldb, &beta, C_use, ldc);
+        if (result == 0 && C_copy) {
+            matmul_scatter(C_b, tC, m, p, C_row_stride, C_col_stride);
         }
-
         if (result != 0) {
             break;
         }
     }
-
-    delete[] temp_A_buffer;
-    delete[] temp_B_buffer;
-    delete[] temp_C_buffer;  // delete[] on nullptr is a no-op
-
+    PyMem_RawFree(tA);
+    PyMem_RawFree(tB);
+    PyMem_RawFree(tC);
     if (result != 0) {
-        if (!PyErr_Occurred()) {
-            PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed");
-        }
+        PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed");
         return -1;
     }
-
     return 0;
 }
 
+static int
+quad_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                   npy_intp const dimensions[], npy_intp const strides[],
+                                   NpyAuxData *auxdata)
+{
+    return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata);
+}
+
 static int
 naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
                           npy_intp const dimensions[], npy_intp const strides[],
diff --git a/tests/test_dot.py b/tests/test_dot.py
index 76dd857..85b26b9 100644
--- a/tests/test_dot.py
+++ b/tests/test_dot.py
@@ -941,15 +941,10 @@ def test_1d_at_3d_collapsed_vecmat(self):
 
     # ---------- Non-contiguous / Fortran-order batched matrices ----------
 
-    @pytest.mark.xfail(
-        reason="F-contiguous matmul is broken even at 2D — separate pre-existing "
-               "bug in the QBLAS dispatch path (treats col-major data as row-major). "
-               "Not related to the batch-loop fix; tracked separately."
-    )
     def test_batched_fortran_order(self):
-        """F-contiguous batched arrays. Currently xfails due to a separate
-        pre-existing bug in the matmul aligned strided loop: it always passes
-        layout 'R' to qblas_gemm regardless of stride pattern."""
+        """F-contiguous batched arrays (issue #89). The matmul loop now derives
+        the QBLAS transpose flag from each operand's strides instead of always
+        passing layout 'R', so col-major data is handled correctly."""
         rng = np.random.default_rng(seed=20)
         A_f = np.asfortranarray(rng.standard_normal((3, 4, 5)))
         B_f = np.asfortranarray(rng.standard_normal((3, 5, 6)))
@@ -969,16 +964,11 @@ def test_batched_non_contiguous_slice(self):
         B_q = _qnd(B_full_f)[::2]
         _assert_matmul_matches_float64(A_q, B_q, A_f, B_f)
 
-    @pytest.mark.xfail(
-        reason="A.swapaxes view yields a per-batch col-major matrix; same "
-               "pre-existing layout-dispatch bug as test_batched_fortran_order. "
-               "Not related to the batch-loop fix; tracked separately."
-    )
     def test_batched_transposed_view(self):
-        """A.swapaxes-based view of a 3D array as the batched matrix.
+        """A.swapaxes-based view of a 3D array as the batched matrix (issue #89).
 
-        After swapping the last two axes, each batch is col-major. Like
-        F-order arrays, this hits the same pre-existing layout-dispatch bug.
+        After swapping the last two axes, each batch is col-major; the loop now
+        passes transa='T' for it rather than mis-reading it as row-major.
         """
         rng = np.random.default_rng(seed=22)
         A_full_f = rng.standard_normal((3, 5, 4))  # shape (B, K, M)
@@ -1054,3 +1044,119 @@ def test_batched_larger(self, batch, m, k, n):
         B_f = rng.standard_normal((batch, k, n))
         _assert_matmul_matches_float64(_qnd(A_f), _qnd(B_f), A_f, B_f,
                                        rtol=1e-13, atol=1e-13)
+
+
+class TestMatmulLayouts:
+    """Issue #89: matmul must give correct results for every memory layout, not
+    just row-major. Inputs are dispatched to QBLAS with the transpose flag
+    derived from each operand's strides, and non-BLAS-able operands fall back to
+    the naive loop. Every case is cross-checked against float64."""
+
+    @staticmethod
+    def _layout(a, kind):
+        if kind == "C":
+            return np.ascontiguousarray(a)
+        if kind == "F":
+            return np.asfortranarray(a)
+        if kind == "T":
+            return np.ascontiguousarray(a.T).T
+        if kind == "S":
+            big = np.zeros((a.shape[0] * 2, a.shape[1] * 2), dtype=a.dtype)
+            big[::2, ::2] = a
+            return big[::2, ::2]
+        raise ValueError(kind)
+
+    @pytest.mark.parametrize("a_kind", ["C", "F", "T", "S"])
+    @pytest.mark.parametrize("b_kind", ["C", "F", "T", "S"])
+    def test_gemm_all_layout_combinations(self, a_kind, b_kind):
+        rng = np.random.default_rng(seed=100)
+        A_f = rng.standard_normal((4, 3))
+        B_f = rng.standard_normal((3, 5))
+        A_q = self._layout(_qnd(A_f), a_kind)
+        B_q = self._layout(_qnd(B_f), b_kind)
+        _assert_matmul_matches_float64(A_q, B_q, A_f, B_f)
+
+    @pytest.mark.parametrize("out_kind", ["C", "F"])
+    def test_gemm_output_layout(self, out_kind):
+        rng = np.random.default_rng(seed=101)
+        A_f = rng.standard_normal((4, 3))
+        B_f = rng.standard_normal((3, 5))
+        A_q, B_q = _qnd(A_f), _qnd(B_f)
+        ref = A_f @ B_f
+        out = np.zeros((4, 5), dtype=QuadPrecDType())
+        if out_kind == "F":
+            out = np.asfortranarray(out)
+        np.matmul(A_q, B_q, out=out)
+        for i in range(4):
+            for j in range(5):
+                assert abs(float(out[i, j]) - ref[i, j]) <= 1e-13 + 1e-13 * max(abs(ref[i, j]), 1.0)
+
+    @pytest.mark.parametrize("a_kind", ["C", "F", "T", "S"])
+    def test_gemv_all_layouts(self, a_kind):
+        rng = np.random.default_rng(seed=102)
+        A_f = rng.standard_normal((5, 4))
+        x_f = rng.standard_normal((4,))
+        A_q = self._layout(_qnd(A_f), a_kind)
+        _assert_matmul_matches_float64(A_q, _qnd(x_f), A_f, x_f)
+
+    @pytest.mark.parametrize("stride", [2, 3])
+    def test_dot_strided_vectors(self, stride):
+        rng = np.random.default_rng(seed=103)
+        x_f = rng.standard_normal((7 * stride,))
+        y_f = rng.standard_normal((7 * stride,))
+        _assert_matmul_matches_float64(_qnd(x_f)[::stride], _qnd(y_f)[::stride],
+                                       x_f[::stride], y_f[::stride])
+
+    @pytest.mark.parametrize("A_shape,B_shape", [
+        ((0, 3), (3, 4)),
+        ((4, 0), (0, 5)),
+        ((3, 4), (4, 0)),
+    ])
+    def test_empty_core_dims_match_float64(self, A_shape, B_shape):
+        A_f = np.ones(A_shape)
+        B_f = np.ones(B_shape)
+        _assert_matmul_matches_float64(_qnd(A_f), _qnd(B_f), A_f, B_f)
+
+
+class TestMatmulCopyPath:
+    """Issue #89 Tier 2: layouts that are not directly BLAS-able (fully strided,
+    negative strides, non-row-major output) are copied into contiguous temps and
+    still routed through QBLAS rather than the naive loop. All cross-checked
+    against float64."""
+
+    def test_fully_strided_both_inputs(self):
+        rng = np.random.default_rng(seed=200)
+        A_f = rng.standard_normal((20, 16))
+        B_f = rng.standard_normal((16, 24))
+        _assert_matmul_matches_float64(_qnd(A_f)[::2, ::2], _qnd(B_f)[::2, ::2],
+                                       A_f[::2, ::2], B_f[::2, ::2])
+
+    def test_negative_stride_input(self):
+        rng = np.random.default_rng(seed=201)
+        A_f = rng.standard_normal((6, 5))
+        B_f = rng.standard_normal((5, 7))
+        _assert_matmul_matches_float64(_qnd(A_f)[::-1], _qnd(B_f), A_f[::-1], B_f)
+
+    def test_strided_A_fortran_B_fortran_out_three_temps(self):
+        rng = np.random.default_rng(seed=202)
+        A_f = rng.standard_normal((20, 7))
+        B_f = rng.standard_normal((7, 11))
+        out = np.asfortranarray(np.zeros((10, 11), dtype=QuadPrecDType()))
+        np.matmul(_qnd(A_f)[::2], np.asfortranarray(_qnd(B_f)), out=out)
+        ref = A_f[::2] @ B_f
+        for i in range(10):
+            for j in range(11):
+                assert abs(float(out[i, j]) - ref[i, j]) <= 1e-12 + 1e-12 * max(abs(ref[i, j]), 1.0)
+
+    def test_gemv_fully_strided_matrix(self):
+        rng = np.random.default_rng(seed=203)
+        A_f = rng.standard_normal((12, 10))
+        x_f = rng.standard_normal((10,))
+        _assert_matmul_matches_float64(_qnd(A_f)[::2, ::2], _qnd(x_f)[:5],
+                                       A_f[::2, ::2], x_f[:5])
+
+    def test_gemv_negative_stride_vector(self):
+        rng = np.random.default_rng(seed=204)
+        A_f = rng.standard_normal((5, 6))
+        x_f = rng.standard_normal((6,))
+        _assert_matmul_matches_float64(_qnd(A_f), _qnd(x_f)[::-1], A_f, x_f[::-1])

From 784a75fd86255e82dee42549d1e497a6d718ce2c Mon Sep 17 00:00:00 2001
From: SwayamInSync <hawkempire007@gmail.com>
Date: Fri, 5 Jun 2026 18:27:48 +0000
Subject: [PATCH 2/2] gemv null handling

---
 src/csrc/quadblas_interface.cpp | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/csrc/quadblas_interface.cpp b/src/csrc/quadblas_interface.cpp
index efdf5b8..0335e74 100644
--- a/src/csrc/quadblas_interface.cpp
+++ b/src/csrc/quadblas_interface.cpp
@@ -46,14 +46,18 @@ qblas_gemv(char layout, char trans, size_t m, size_t n,
            Sleef_quad *beta, Sleef_quad *y, size_t incy)
 {
     if (m == 0 || n == 0) {
-        if (y && beta) {
-            int do_trans = (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c');
-            size_t y_len = do_trans ? n : m;
-            Sleef_quad zero = Sleef_cast_from_doubleq1(0.0);
-            int beta_zero = Sleef_icmpeqq1(*beta, zero);
-            for (size_t i = 0; i < y_len; i++) {
-                y[i * incy] = beta_zero ? zero : Sleef_mulq1_u05(*beta, y[i * incy]);
-            }
+        int do_trans = (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c');
+        size_t y_len = do_trans ? n : m;
+        if (y_len == 0) {
+            return 0;
+        }
+        if (!y || !beta) {
+            return -1;
+        }
+        Sleef_quad zero = Sleef_cast_from_doubleq1(0.0);
+        int beta_zero = Sleef_icmpeqq1(*beta, zero);
+        for (size_t i = 0; i < y_len; i++) {
+            y[i * incy] = beta_zero ? zero : Sleef_mulq1_u05(*beta, y[i * incy]);
         }
         return 0;
     }