From 2afe7299cc47f2c33b34288db465339f20f1e16f Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Fri, 5 Jun 2026 12:10:11 +0000 Subject: [PATCH 1/2] adding F-contig layout support + other cleanups --- src/csrc/quadblas_interface.cpp | 11 +- src/csrc/umath/matmul.cpp | 396 ++++++++++++++++++++------------ tests/test_dot.py | 138 +++++++++-- 3 files changed, 376 insertions(+), 169 deletions(-) diff --git a/src/csrc/quadblas_interface.cpp b/src/csrc/quadblas_interface.cpp index f73ba9c..efdf5b8 100644 --- a/src/csrc/quadblas_interface.cpp +++ b/src/csrc/quadblas_interface.cpp @@ -46,6 +46,15 @@ qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *beta, Sleef_quad *y, size_t incy) { if (m == 0 || n == 0) { + if (y && beta) { + int do_trans = (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c'); + size_t y_len = do_trans ? n : m; + Sleef_quad zero = Sleef_cast_from_doubleq1(0.0); + int beta_zero = Sleef_icmpeqq1(*beta, zero); + for (size_t i = 0; i < y_len; i++) { + y[i * incy] = beta_zero ? zero : Sleef_mulq1_u05(*beta, y[i * incy]); + } + } return 0; } if (!alpha || !A || !x || !beta || !y) { @@ -66,7 +75,7 @@ qblas_gemm(char layout, char transa, char transb, Sleef_quad *B, size_t ldb, Sleef_quad *beta, Sleef_quad *C, size_t ldc) { - if (m == 0 || n == 0 || k == 0) { + if (m == 0 || n == 0) { return 0; } if (!alpha || !A || !B || !beta || !C) { diff --git a/src/csrc/umath/matmul.cpp b/src/csrc/umath/matmul.cpp index 165a2c8..c04fe06 100644 --- a/src/csrc/umath/matmul.cpp +++ b/src/csrc/umath/matmul.cpp @@ -16,6 +16,8 @@ extern "C" { #include "numpy/dtype_api.h" } +#include + #include "quad_common.h" #include "scalar.h" #include "dtype.h" @@ -93,30 +95,89 @@ determine_operation_type(npy_intp m, npy_intp n, npy_intp p) } } +static int +naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata); + +static inline int +matmul_stride_ok(npy_intp s) +{ + npy_intp elsize = (npy_intp)sizeof(Sleef_quad); + return s > 0 && (s % elsize == 0) && (s / elsize) <= INT_MAX; +} + +static inline int +is_blasable2d(npy_intp s_slow, npy_intp s_fast, npy_intp d_slow, npy_intp d_fast) +{ + npy_intp elsize = (npy_intp)sizeof(Sleef_quad); + if (s_fast != elsize) { + return 0; + } + npy_intp unit_slow = s_slow / elsize; + return (s_slow % elsize == 0) && (unit_slow >= d_fast) && (unit_slow <= INT_MAX); +} + +static int +matmul_alloc_count(npy_intp a, npy_intp b, size_t *out) +{ + if (a < 0 || b < 0) { + return -1; + } + if (a != 0 && b > NPY_MAX_INTP / a) { + return -1; + } + npy_intp count = a * b; + if ((size_t)count > ((size_t)-1) / sizeof(Sleef_quad)) { + return -1; + } + *out = (size_t)count; + return 0; +} + +static void +matmul_gather(Sleef_quad *dst, const char *src, npy_intp rows, npy_intp cols, + npy_intp row_stride, npy_intp col_stride) +{ + for (npy_intp i = 0; i < rows; i++) { + const char *row = src + i * row_stride; + for (npy_intp j = 0; j < cols; j++) { + memcpy(&dst[i * cols + j], row + j * col_stride, sizeof(Sleef_quad)); + } + } +} + +static void +matmul_scatter(char *dst, const Sleef_quad *src, npy_intp rows, npy_intp cols, + npy_intp row_stride, npy_intp col_stride) +{ + for (npy_intp i = 0; i < rows; i++) { + char *row = dst + i * row_stride; + for (npy_intp j = 0; j < cols; j++) { + memcpy(row + j * col_stride, &src[i * cols + j], sizeof(Sleef_quad)); + } + } +} + static int quad_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata) { - // Outer (broadcast / batch) loop length and core matrix dims. npy_intp N = dimensions[0]; npy_intp m = dimensions[1]; npy_intp n = dimensions[2]; npy_intp p = dimensions[3]; - // Strides between batches for each operand. npy_intp A_stride = strides[0]; npy_intp B_stride = strides[1]; npy_intp C_stride = strides[2]; - - // Strides within a single core matrix. npy_intp A_row_stride = strides[3]; npy_intp A_col_stride = strides[4]; npy_intp B_row_stride = strides[5]; - // B_col_stride / C_col_stride are unused in this branch (output is - // assumed row-major contiguous within each batch, matching the QBLAS - // call shape). + npy_intp B_col_stride = strides[6]; npy_intp C_row_stride = strides[7]; + npy_intp C_col_stride = strides[8]; QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; if (descr->backend != BACKEND_SLEEF) { @@ -126,192 +187,223 @@ quad_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const dat return -1; } + if (m > INT_MAX || n > INT_MAX || p > INT_MAX || m == 0 || n == 0 || p == 0) { + return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata); + } + MatmulOperationType op_type = determine_operation_type(m, n, p); Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); + const npy_intp elsize = (npy_intp)sizeof(Sleef_quad); char *A_base = data[0]; char *B_base = data[1]; char *C_base = data[2]; - for (npy_intp batch = 0; batch < N; batch++) { - Sleef_quad *A_ptr = (Sleef_quad *)(A_base + batch * A_stride); - Sleef_quad *B_ptr = (Sleef_quad *)(B_base + batch * B_stride); - Sleef_quad *C_ptr = (Sleef_quad *)(C_base + batch * C_stride); - - int result = -1; - - switch (op_type) { - case MATMUL_DOT: { - size_t incx = A_col_stride / sizeof(Sleef_quad); - size_t incy = B_row_stride / sizeof(Sleef_quad); - - result = qblas_dot(n, A_ptr, incx, B_ptr, incy, C_ptr); - break; + if (op_type == MATMUL_DOT) { + if (!matmul_stride_ok(A_col_stride) || !matmul_stride_ok(B_row_stride)) { + return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata); + } + size_t incx = (size_t)(A_col_stride / elsize); + size_t incy = (size_t)(B_row_stride / elsize); + for (npy_intp batch = 0; batch < N; batch++) { + Sleef_quad *A_ptr = (Sleef_quad *)(A_base + batch * A_stride); + Sleef_quad *B_ptr = (Sleef_quad *)(B_base + batch * B_stride); + Sleef_quad *C_ptr = (Sleef_quad *)(C_base + batch * C_stride); + if (qblas_dot((size_t)n, A_ptr, incx, B_ptr, incy, C_ptr) != 0) { + PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed"); + return -1; } + } + return 0; + } - case MATMUL_GEMV: { - size_t lda = A_row_stride / sizeof(Sleef_quad); - size_t incx = B_row_stride / sizeof(Sleef_quad); - size_t incy = C_row_stride / sizeof(Sleef_quad); - - memset(C_ptr, 0, m * p * sizeof(Sleef_quad)); + if (op_type == MATMUL_GEMV) { + char transa; + size_t lda, gemv_rows, gemv_cols; + int A_copy = 0; + if (is_blasable2d(A_row_stride, A_col_stride, m, n)) { + transa = 'N'; + gemv_rows = (size_t)m; + gemv_cols = (size_t)n; + lda = (size_t)(A_row_stride / elsize); + } + else if (is_blasable2d(A_col_stride, A_row_stride, n, m)) { + transa = 'T'; + gemv_rows = (size_t)n; + gemv_cols = (size_t)m; + lda = (size_t)(A_col_stride / elsize); + } + else { + transa = 'N'; + gemv_rows = (size_t)m; + gemv_cols = (size_t)n; + lda = (size_t)n; + A_copy = 1; + } - result = qblas_gemv('R', 'N', m, n, &alpha, A_ptr, lda, B_ptr, incx, &beta, - C_ptr, incy); - break; - } + int x_copy = !matmul_stride_ok(B_row_stride); + int y_copy = !matmul_stride_ok(C_row_stride); + size_t incx = x_copy ? 1 : (size_t)(B_row_stride / elsize); + size_t incy = y_copy ? 1 : (size_t)(C_row_stride / elsize); - case MATMUL_GEMM: { - size_t lda = A_row_stride / sizeof(Sleef_quad); - size_t ldb = B_row_stride / sizeof(Sleef_quad); - size_t ldc_numpy = C_row_stride / sizeof(Sleef_quad); + size_t cntA = 0; + if (A_copy && matmul_alloc_count(m, n, &cntA) != 0) { + return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata); + } - memset(C_ptr, 0, m * p * sizeof(Sleef_quad)); + Sleef_quad *tA = NULL, *tx = NULL, *ty = NULL; + if (A_copy) tA = (Sleef_quad *)PyMem_RawMalloc(cntA * sizeof(Sleef_quad)); + if (x_copy) tx = (Sleef_quad *)PyMem_RawMalloc((size_t)n * sizeof(Sleef_quad)); + if (y_copy) ty = (Sleef_quad *)PyMem_RawMalloc((size_t)m * sizeof(Sleef_quad)); + if ((A_copy && !tA) || (x_copy && !tx) || (y_copy && !ty)) { + PyMem_RawFree(tA); + PyMem_RawFree(tx); + PyMem_RawFree(ty); + PyErr_NoMemory(); + return -1; + } - result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, A_ptr, lda, B_ptr, ldb, - &beta, C_ptr, ldc_numpy); + int result = 0; + for (npy_intp batch = 0; batch < N; batch++) { + char *A_b = A_base + batch * A_stride; + char *B_b = B_base + batch * B_stride; + char *C_b = C_base + batch * C_stride; + Sleef_quad *A_use = (Sleef_quad *)A_b; + Sleef_quad *x_use = (Sleef_quad *)B_b; + Sleef_quad *y_use = (Sleef_quad *)C_b; + if (A_copy) { + matmul_gather(tA, A_b, m, n, A_row_stride, A_col_stride); + A_use = tA; + } + if (x_copy) { + matmul_gather(tx, B_b, n, 1, B_row_stride, elsize); + x_use = tx; + } + if (y_copy) { + y_use = ty; + } + result = qblas_gemv('R', transa, gemv_rows, gemv_cols, &alpha, A_use, lda, + x_use, incx, &beta, y_use, incy); + if (result == 0 && y_copy) { + matmul_scatter(C_b, ty, m, 1, C_row_stride, elsize); + } + if (result != 0) { break; } - - default: - PyErr_SetString(PyExc_RuntimeError, - "internal error: unknown MatmulOperationType"); - return -1; } - + PyMem_RawFree(tA); + PyMem_RawFree(tx); + PyMem_RawFree(ty); if (result != 0) { PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed"); return -1; } + return 0; } - return 0; -} - -static int -quad_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - // Outer (broadcast / batch) loop length and core matrix dims. - npy_intp N = dimensions[0]; - npy_intp m = dimensions[1]; - npy_intp n = dimensions[2]; - npy_intp p = dimensions[3]; - - // Strides between batches for each operand. - npy_intp A_stride = strides[0]; - npy_intp B_stride = strides[1]; - npy_intp C_stride = strides[2]; - - // Core strides aren't needed here: this branch copies each batch into - // a contiguous temp buffer before passing it to QBLAS. - - QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; - if (descr->backend != BACKEND_SLEEF) { - PyErr_SetString(PyExc_NotImplementedError, - "QBLAS-accelerated matmul only supports SLEEF backend. " - "Please raise the issue at SwayamInSync/QBLAS for longdouble support"); - return -1; + char transa, transb; + size_t lda, ldb, ldc; + int A_copy = 0, B_copy = 0, C_copy = 0; + if (is_blasable2d(A_row_stride, A_col_stride, m, n)) { + transa = 'N'; + lda = (size_t)(A_row_stride / elsize); + } + else if (is_blasable2d(A_col_stride, A_row_stride, n, m)) { + transa = 'T'; + lda = (size_t)(A_col_stride / elsize); + } + else { + transa = 'N'; + lda = (size_t)n; + A_copy = 1; + } + if (is_blasable2d(B_row_stride, B_col_stride, n, p)) { + transb = 'N'; + ldb = (size_t)(B_row_stride / elsize); + } + else if (is_blasable2d(B_col_stride, B_row_stride, p, n)) { + transb = 'T'; + ldb = (size_t)(B_col_stride / elsize); + } + else { + transb = 'N'; + ldb = (size_t)p; + B_copy = 1; + } + if (is_blasable2d(C_row_stride, C_col_stride, m, p)) { + ldc = (size_t)(C_row_stride / elsize); + } + else { + ldc = (size_t)p; + C_copy = 1; } - MatmulOperationType op_type = determine_operation_type(m, n, p); - Sleef_quad alpha = Sleef_cast_from_doubleq1(1.0); - Sleef_quad beta = Sleef_cast_from_doubleq1(0.0); - - char *A_base = data[0]; - char *B_base = data[1]; - char *C_base = data[2]; - - // Allocate the contiguous temp buffers once, reuse them across batches. - Sleef_quad *temp_A_buffer = nullptr; - Sleef_quad *temp_B_buffer = nullptr; - Sleef_quad *temp_C_buffer = nullptr; + size_t cntA = 0, cntB = 0, cntC = 0; + if ((A_copy && matmul_alloc_count(m, n, &cntA) != 0) || + (B_copy && matmul_alloc_count(n, p, &cntB) != 0) || + (C_copy && matmul_alloc_count(m, p, &cntC) != 0)) { + return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata); + } - switch (op_type) { - case MATMUL_DOT: - temp_A_buffer = new Sleef_quad[n]; - temp_B_buffer = new Sleef_quad[n]; - break; - case MATMUL_GEMV: - case MATMUL_GEMM: - temp_A_buffer = new Sleef_quad[m * n]; - temp_B_buffer = new Sleef_quad[n * p]; - temp_C_buffer = new Sleef_quad[m * p]; - break; - default: - PyErr_SetString(PyExc_RuntimeError, - "internal error: unknown MatmulOperationType"); - return -1; + Sleef_quad *tA = NULL, *tB = NULL, *tC = NULL; + if (A_copy) tA = (Sleef_quad *)PyMem_RawMalloc(cntA * sizeof(Sleef_quad)); + if (B_copy) tB = (Sleef_quad *)PyMem_RawMalloc(cntB * sizeof(Sleef_quad)); + if (C_copy) tC = (Sleef_quad *)PyMem_RawMalloc(cntC * sizeof(Sleef_quad)); + if ((A_copy && !tA) || (B_copy && !tB) || (C_copy && !tC)) { + PyMem_RawFree(tA); + PyMem_RawFree(tB); + PyMem_RawFree(tC); + PyErr_NoMemory(); + return -1; } int result = 0; for (npy_intp batch = 0; batch < N; batch++) { - Sleef_quad *A_batch = (Sleef_quad *)(A_base + batch * A_stride); - Sleef_quad *B_batch = (Sleef_quad *)(B_base + batch * B_stride); - Sleef_quad *C_batch = (Sleef_quad *)(C_base + batch * C_stride); - - switch (op_type) { - case MATMUL_DOT: { - memcpy(temp_A_buffer, A_batch, n * sizeof(Sleef_quad)); - memcpy(temp_B_buffer, B_batch, n * sizeof(Sleef_quad)); - result = qblas_dot(n, temp_A_buffer, 1, temp_B_buffer, 1, C_batch); - break; - } - - case MATMUL_GEMV: { - memcpy(temp_A_buffer, A_batch, m * n * sizeof(Sleef_quad)); - memcpy(temp_B_buffer, B_batch, n * p * sizeof(Sleef_quad)); - memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad)); - - result = qblas_gemv('R', 'N', m, n, &alpha, temp_A_buffer, n, - temp_B_buffer, 1, &beta, temp_C_buffer, 1); - if (result == 0) { - memcpy(C_batch, temp_C_buffer, m * p * sizeof(Sleef_quad)); - } - break; - } - - case MATMUL_GEMM: { - memcpy(temp_A_buffer, A_batch, m * n * sizeof(Sleef_quad)); - memcpy(temp_B_buffer, B_batch, n * p * sizeof(Sleef_quad)); - memset(temp_C_buffer, 0, m * p * sizeof(Sleef_quad)); - - result = qblas_gemm('R', 'N', 'N', m, p, n, &alpha, temp_A_buffer, n, - temp_B_buffer, p, &beta, temp_C_buffer, p); - if (result == 0) { - memcpy(C_batch, temp_C_buffer, m * p * sizeof(Sleef_quad)); - } - break; - } - - default: - PyErr_SetString(PyExc_RuntimeError, - "internal error: unknown MatmulOperationType"); - result = -1; - break; + char *A_b = A_base + batch * A_stride; + char *B_b = B_base + batch * B_stride; + char *C_b = C_base + batch * C_stride; + Sleef_quad *A_use = (Sleef_quad *)A_b; + Sleef_quad *B_use = (Sleef_quad *)B_b; + Sleef_quad *C_use = (Sleef_quad *)C_b; + if (A_copy) { + matmul_gather(tA, A_b, m, n, A_row_stride, A_col_stride); + A_use = tA; + } + if (B_copy) { + matmul_gather(tB, B_b, n, p, B_row_stride, B_col_stride); + B_use = tB; + } + if (C_copy) { + C_use = tC; + } + result = qblas_gemm('R', transa, transb, (size_t)m, (size_t)p, (size_t)n, + &alpha, A_use, lda, B_use, ldb, &beta, C_use, ldc); + if (result == 0 && C_copy) { + matmul_scatter(C_b, tC, m, p, C_row_stride, C_col_stride); } - if (result != 0) { break; } } - - delete[] temp_A_buffer; - delete[] temp_B_buffer; - delete[] temp_C_buffer; // delete[] on nullptr is a no-op - + PyMem_RawFree(tA); + PyMem_RawFree(tB); + PyMem_RawFree(tC); if (result != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed"); - } + PyErr_SetString(PyExc_RuntimeError, "QBLAS operation failed"); return -1; } - return 0; } +static int +quad_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + return naive_matmul_strided_loop(context, data, dimensions, strides, auxdata); +} + static int naive_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[], npy_intp const dimensions[], npy_intp const strides[], diff --git a/tests/test_dot.py b/tests/test_dot.py index 76dd857..85b26b9 100644 --- a/tests/test_dot.py +++ b/tests/test_dot.py @@ -941,15 +941,10 @@ def test_1d_at_3d_collapsed_vecmat(self): # ---------- Non-contiguous / Fortran-order batched matrices ---------- - @pytest.mark.xfail( - reason="F-contiguous matmul is broken even at 2D — separate pre-existing " - "bug in the QBLAS dispatch path (treats col-major data as row-major). " - "Not related to the batch-loop fix; tracked separately." - ) def test_batched_fortran_order(self): - """F-contiguous batched arrays. Currently xfails due to a separate - pre-existing bug in the matmul aligned strided loop: it always passes - layout 'R' to qblas_gemm regardless of stride pattern.""" + """F-contiguous batched arrays (issue #89). The matmul loop now derives + the QBLAS transpose flag from each operand's strides instead of always + passing layout 'R', so col-major data is handled correctly.""" rng = np.random.default_rng(seed=20) A_f = np.asfortranarray(rng.standard_normal((3, 4, 5))) B_f = np.asfortranarray(rng.standard_normal((3, 5, 6))) @@ -969,16 +964,11 @@ def test_batched_non_contiguous_slice(self): B_q = _qnd(B_full_f)[::2] _assert_matmul_matches_float64(A_q, B_q, A_f, B_f) - @pytest.mark.xfail( - reason="A.swapaxes view yields a per-batch col-major matrix; same " - "pre-existing layout-dispatch bug as test_batched_fortran_order. " - "Not related to the batch-loop fix; tracked separately." - ) def test_batched_transposed_view(self): - """A.swapaxes-based view of a 3D array as the batched matrix. + """A.swapaxes-based view of a 3D array as the batched matrix (issue #89). - After swapping the last two axes, each batch is col-major. Like - F-order arrays, this hits the same pre-existing layout-dispatch bug. + After swapping the last two axes, each batch is col-major; the loop now + passes transa='T' for it rather than mis-reading it as row-major. """ rng = np.random.default_rng(seed=22) A_full_f = rng.standard_normal((3, 5, 4)) # shape (B, K, M) @@ -1054,3 +1044,119 @@ def test_batched_larger(self, batch, m, k, n): B_f = rng.standard_normal((batch, k, n)) _assert_matmul_matches_float64(_qnd(A_f), _qnd(B_f), A_f, B_f, rtol=1e-13, atol=1e-13) + + +class TestMatmulLayouts: + """Issue #89: matmul must give correct results for every memory layout, not + just row-major. Inputs are dispatched to QBLAS with the transpose flag + derived from each operand's strides, and non-BLAS-able operands fall back to + the naive loop. Every case is cross-checked against float64.""" + + @staticmethod + def _layout(a, kind): + if kind == "C": + return np.ascontiguousarray(a) + if kind == "F": + return np.asfortranarray(a) + if kind == "T": + return np.ascontiguousarray(a.T).T + if kind == "S": + big = np.zeros((a.shape[0] * 2, a.shape[1] * 2), dtype=a.dtype) + big[::2, ::2] = a + return big[::2, ::2] + raise ValueError(kind) + + @pytest.mark.parametrize("a_kind", ["C", "F", "T", "S"]) + @pytest.mark.parametrize("b_kind", ["C", "F", "T", "S"]) + def test_gemm_all_layout_combinations(self, a_kind, b_kind): + rng = np.random.default_rng(seed=100) + A_f = rng.standard_normal((4, 3)) + B_f = rng.standard_normal((3, 5)) + A_q = self._layout(_qnd(A_f), a_kind) + B_q = self._layout(_qnd(B_f), b_kind) + _assert_matmul_matches_float64(A_q, B_q, A_f, B_f) + + @pytest.mark.parametrize("out_kind", ["C", "F"]) + def test_gemm_output_layout(self, out_kind): + rng = np.random.default_rng(seed=101) + A_f = rng.standard_normal((4, 3)) + B_f = rng.standard_normal((3, 5)) + A_q, B_q = _qnd(A_f), _qnd(B_f) + ref = A_f @ B_f + out = np.zeros((4, 5), dtype=QuadPrecDType()) + if out_kind == "F": + out = np.asfortranarray(out) + np.matmul(A_q, B_q, out=out) + for i in range(4): + for j in range(5): + assert abs(float(out[i, j]) - ref[i, j]) <= 1e-13 + 1e-13 * max(abs(ref[i, j]), 1.0) + + @pytest.mark.parametrize("a_kind", ["C", "F", "T", "S"]) + def test_gemv_all_layouts(self, a_kind): + rng = np.random.default_rng(seed=102) + A_f = rng.standard_normal((5, 4)) + x_f = rng.standard_normal((4,)) + A_q = self._layout(_qnd(A_f), a_kind) + _assert_matmul_matches_float64(A_q, _qnd(x_f), A_f, x_f) + + @pytest.mark.parametrize("stride", [2, 3]) + def test_dot_strided_vectors(self, stride): + rng = np.random.default_rng(seed=103) + x_f = rng.standard_normal((7 * stride,)) + y_f = rng.standard_normal((7 * stride,)) + _assert_matmul_matches_float64(_qnd(x_f)[::stride], _qnd(y_f)[::stride], + x_f[::stride], y_f[::stride]) + + @pytest.mark.parametrize("A_shape,B_shape", [ + ((0, 3), (3, 4)), + ((4, 0), (0, 5)), + ((3, 4), (4, 0)), + ]) + def test_empty_core_dims_match_float64(self, A_shape, B_shape): + A_f = np.ones(A_shape) + B_f = np.ones(B_shape) + _assert_matmul_matches_float64(_qnd(A_f), _qnd(B_f), A_f, B_f) + + +class TestMatmulCopyPath: + """Issue #89 Tier 2: layouts that are not directly BLAS-able (fully strided, + negative strides, non-row-major output) are copied into contiguous temps and + still routed through QBLAS rather than the naive loop. All cross-checked + against float64.""" + + def test_fully_strided_both_inputs(self): + rng = np.random.default_rng(seed=200) + A_f = rng.standard_normal((20, 16)) + B_f = rng.standard_normal((16, 24)) + _assert_matmul_matches_float64(_qnd(A_f)[::2, ::2], _qnd(B_f)[::2, ::2], + A_f[::2, ::2], B_f[::2, ::2]) + + def test_negative_stride_input(self): + rng = np.random.default_rng(seed=201) + A_f = rng.standard_normal((6, 5)) + B_f = rng.standard_normal((5, 7)) + _assert_matmul_matches_float64(_qnd(A_f)[::-1], _qnd(B_f), A_f[::-1], B_f) + + def test_strided_A_fortran_B_fortran_out_three_temps(self): + rng = np.random.default_rng(seed=202) + A_f = rng.standard_normal((20, 7)) + B_f = rng.standard_normal((7, 11)) + out = np.asfortranarray(np.zeros((10, 11), dtype=QuadPrecDType())) + np.matmul(_qnd(A_f)[::2], np.asfortranarray(_qnd(B_f)), out=out) + ref = A_f[::2] @ B_f + for i in range(10): + for j in range(11): + assert abs(float(out[i, j]) - ref[i, j]) <= 1e-12 + 1e-12 * max(abs(ref[i, j]), 1.0) + + def test_gemv_fully_strided_matrix(self): + rng = np.random.default_rng(seed=203) + A_f = rng.standard_normal((12, 10)) + x_f = rng.standard_normal((10,)) + _assert_matmul_matches_float64(_qnd(A_f)[::2, ::2], _qnd(x_f)[:5], + A_f[::2, ::2], x_f[:5]) + + def test_gemv_negative_stride_vector(self): + rng = np.random.default_rng(seed=204) + A_f = rng.standard_normal((5, 6)) + x_f = rng.standard_normal((6,)) + _assert_matmul_matches_float64(_qnd(A_f), _qnd(x_f)[::-1], A_f, x_f[::-1]) From 784a75fd86255e82dee42549d1e497a6d718ce2c Mon Sep 17 00:00:00 2001 From: SwayamInSync Date: Fri, 5 Jun 2026 18:27:48 +0000 Subject: [PATCH 2/2] gemv null handling --- src/csrc/quadblas_interface.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/csrc/quadblas_interface.cpp b/src/csrc/quadblas_interface.cpp index efdf5b8..0335e74 100644 --- a/src/csrc/quadblas_interface.cpp +++ b/src/csrc/quadblas_interface.cpp @@ -46,14 +46,18 @@ qblas_gemv(char layout, char trans, size_t m, size_t n, Sleef_quad *beta, Sleef_quad *y, size_t incy) { if (m == 0 || n == 0) { - if (y && beta) { - int do_trans = (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c'); - size_t y_len = do_trans ? n : m; - Sleef_quad zero = Sleef_cast_from_doubleq1(0.0); - int beta_zero = Sleef_icmpeqq1(*beta, zero); - for (size_t i = 0; i < y_len; i++) { - y[i * incy] = beta_zero ? zero : Sleef_mulq1_u05(*beta, y[i * incy]); - } + int do_trans = (trans == 'T' || trans == 't' || trans == 'C' || trans == 'c'); + size_t y_len = do_trans ? n : m; + if (y_len == 0) { + return 0; + } + if (!y || !beta) { + return -1; + } + Sleef_quad zero = Sleef_cast_from_doubleq1(0.0); + int beta_zero = Sleef_icmpeqq1(*beta, zero); + for (size_t i = 0; i < y_len; i++) { + y[i * incy] = beta_zero ? zero : Sleef_mulq1_u05(*beta, y[i * incy]); } return 0; }