From 14cafb45fc6bf62b25592d8cbd66602b6f2f4e58 Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Tue, 7 Apr 2026 15:45:58 -0700
Subject: [PATCH 1/7] checkpoint: stub CSysVector::multiDot, use FGCRODR in adjoint Krylov iterations

---
 Common/include/linear_algebra/CSysVector.hpp  | 27 +++++++++++++++++++
 .../src/drivers/CDiscAdjMultizoneDriver.cpp   |  4 +--
 2 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
index f300c01e091..fdcde50c113 100644
--- a/Common/include/linear_algebra/CSysVector.hpp
+++ b/Common/include/linear_algebra/CSysVector.hpp
@@ -29,6 +29,7 @@
 #pragma once
 
 #include <memory>
+#include <vector>
 
 #include "../parallelization/mpi_structure.hpp"
 #include "../parallelization/omp_structure.hpp"
@@ -371,6 +372,32 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
     return dot_scratch[0];
   }
 
+  /*!
+   * \brief Computes the product of V^T W efficiencly, where V and W are tall matrices stored as vectors of CSysVector.
+   * \param[in] V - Tall matrix.
+   * \param[in] n - Number of columns to consider from V (if 0, the size of V is used).
+   * \param[in] W - Tall matrix.
+   * \param[in] m - Number of columns to consider from W (if 0, the size of W is used).
+   * \param[out] VTW - Matrix to store the product, must be n by m or larger.
+   */
+  template <class Mat>
+  static void multiDot(const std::vector<CSysVector>& V, size_t n, const std::vector<CSysVector>& W, size_t m,
+                       Mat& VTW) {
+    static constexpr size_t BLOCK_SIZE = 1024;
+
+    if (n == 0) n = V.size();
+    if (m == 0) m = W.size();
+    if (n == 0 || m == 0) return;
+
+    su2matrix<ScalarType> local;
+    local.resize(n, m) = ScalarType{};
+
+    for (size_t i = 0; i < n; ++i) {
+      for (size_t j = 0; j < m; ++j) {
+      }
+    }
+  }
+
   /*!
    * \brief Squared L2 norm of the vector (via dot with self).
    * \return Squared L2 norm.
diff --git a/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp b/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
index 392abeebe9f..315ae662e7c 100644
--- a/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
+++ b/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
@@ -380,8 +380,8 @@ void CDiscAdjMultizoneDriver::KrylovInnerIters(unsigned short iZone) {
     Scalar eps_l = 0.0;
     Scalar tol_l = KrylovTol / eps;
     auto iter = min(totalIter-2ul, config_container[iZone]->GetnQuasiNewtonSamples()-2ul);
-    iter = LinSolver[iZone].FGMRES_LinSolver(AdjRHS[iZone], AdjSol[iZone], product, Identity(),
-                                             tol_l, iter, eps_l, monitor, config_container[iZone]);
+    iter = LinSolver[iZone].FGCRODR_LinSolver(AdjRHS[iZone], AdjSol[iZone], product, Identity(),
+                                              tol_l, iter, eps_l, monitor, config_container[iZone]);
     totalIter -= iter+1;
     eps *= eps_l;
   }

From 4cdf922e273b74d88be05c92fb6973013d46c558 Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Wed, 8 Apr 2026 00:34:12 -0700
Subject: [PATCH 2/7] optimize the V^T W product in FGCRODR via CSysVector::multiDot

---
 Common/include/linear_algebra/CSysVector.hpp | 25 ++------
 Common/src/linear_algebra/CSysSolve.cpp      |  4 +-
 Common/src/linear_algebra/CSysVector.cpp     | 67 +++++++++++++++++++-
 3 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
index fdcde50c113..316b4d8e993 100644
--- a/Common/include/linear_algebra/CSysVector.hpp
+++ b/Common/include/linear_algebra/CSysVector.hpp
@@ -375,28 +375,13 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
   /*!
    * \brief Computes the product of V^T W efficiencly, where V and W are tall matrices stored as vectors of CSysVector.
    * \param[in] V - Tall matrix.
-   * \param[in] n - Number of columns to consider from V (if 0, the size of V is used).
+   * \param[in] n - Number of columns to consider from V.
    * \param[in] W - Tall matrix.
-   * \param[in] m - Number of columns to consider from W (if 0, the size of W is used).
-   * \param[out] VTW - Matrix to store the product, must be n by m or larger.
+   * \param[in] m - Number of columns to consider from W.
+   * \return n by m matrix with the result of the product.
    */
-  template <class Mat>
-  static void multiDot(const std::vector<CSysVector>& V, size_t n, const std::vector<CSysVector>& W, size_t m,
-                       Mat& VTW) {
-    static constexpr size_t BLOCK_SIZE = 1024;
-
-    if (n == 0) n = V.size();
-    if (m == 0) m = W.size();
-    if (n == 0 || m == 0) return;
-
-    su2matrix<ScalarType> local;
-    local.resize(n, m) = ScalarType{};
-
-    for (size_t i = 0; i < n; ++i) {
-      for (size_t j = 0; j < m; ++j) {
-      }
-    }
-  }
+  static const su2matrix<ScalarType>& multiDot(const std::vector<CSysVector>& V, size_t n,
+                                               const std::vector<CSysVector>& W, size_t m);
 
   /*!
    * \brief Squared L2 norm of the vector (via dot with self).
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index ce454326546..af1be89e403 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -852,10 +852,10 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
     /*--- Compute Ritz values and keep the ones with the smallest real part. ---*/
 
     EigenMatrix VW = EigenMatrix::Identity(m + 1, m);
+    const auto& tmp = CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
     for (auto i = 0ul; i <= m; ++i) {
       for (auto j = 0ul; j < k; ++j) {
-        // TODO(pedro): There are clever ways to avoid this multiplication, or at least use BLAS.
-        VW(i, j) = V[i].dot(W[j]);
+        VW(i, j) = tmp(i, j);
       }
     }
     const auto Hm = Heigen.topLeftCorner(m + 1, m);
diff --git a/Common/src/linear_algebra/CSysVector.cpp b/Common/src/linear_algebra/CSysVector.cpp
index d14a63b52bf..5d0009f39fa 100644
--- a/Common/src/linear_algebra/CSysVector.cpp
+++ b/Common/src/linear_algebra/CSysVector.cpp
@@ -67,10 +67,75 @@ void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numB
   }
 }
 
+template <class ScalarType>
+const su2matrix<ScalarType>& CSysVector<ScalarType>::multiDot(const std::vector<CSysVector<ScalarType>>& V,
+                                                              const size_t n,
+                                                              const std::vector<CSysVector<ScalarType>>& W,
+                                                              const size_t m) {
+  static constexpr size_t BLOCK_SIZE = 1024;
+  static su2matrix<ScalarType> shared;
+
+  if (n == 0 || m == 0) return shared;
+
+  SU2_OMP_BARRIER
+  const auto size = V[0].nElmDomain;
+
+  su2matrix<ScalarType> local(n, m);
+  local.setConstant(0);
+
+  SU2_OMP_FOR_(schedule(static) SU2_NOWAIT)
+  for (size_t offset = 0; offset < size; offset += BLOCK_SIZE) {
+    const auto limit = std::min(offset + BLOCK_SIZE, size);
+    for (size_t i = 0; i < n; ++i) {
+      const auto& vi = V[i];
+      for (size_t j = 0; j < m; ++j) {
+        const auto& wj = W[j];
+        ScalarType sum = 0.0;
+        SU2_OMP_SIMD
+        for (auto k = offset; k < limit; ++k) {
+          sum += vi[k] * wj[k];
+        }
+        local(i, j) += sum;
+      }
+    }
+  }
+  END_SU2_OMP_FOR
+
+  /*--- Reduce over all threads in an ordered way to ensure a deterministic result. ---*/
+  for (size_t i = 0; i < n; ++i) {
+    for (size_t j = 0; j < m; ++j) {
+      W[j].dot_scratch[omp_get_thread_num()] = local(i, j);
+    }
+    BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS
+    for (size_t j = 0; j < m; ++j) {
+      for (int t = 1; t < omp_get_num_threads(); ++t) {
+        local(i, j) += W[j].dot_scratch[t];
+      }
+    }
+    END_SU2_OMP_SAFE_GLOBAL_ACCESS
+  }
+
+  /*--- Single AllReduce of the result, only the master thread communicates. ---*/
+  SU2_OMP_MASTER {
+    shared.resize(n, m);
+
+    const auto mpi_type = (sizeof(ScalarType) < sizeof(double)) ? MPI_FLOAT : MPI_DOUBLE;
+    SelectMPIWrapper<ScalarType>::W::Allreduce(local.data(), shared.data(), n * m, mpi_type, MPI_SUM,
+                                               SU2_MPI::GetComm());
+  }
+  END_SU2_OMP_MASTER
+
+  /*--- All threads have the same view of the result. ---*/
+  SU2_OMP_BARRIER
+
+  return shared;
+}
+
 template <class ScalarType>
 CSysVector<ScalarType>::~CSysVector() {
-  if (!std::is_trivial<ScalarType>::value)
+  if constexpr (!std::is_trivial_v<ScalarType>) {
     for (auto i = 0ul; i < nElm; i++) vec_val[i].~ScalarType();
+  }
   MemoryAllocation::aligned_free(vec_val);
 
   GPUMemoryAllocation::gpu_free(d_vec_val);

From d8df0b8cdd326d9ef44af1029a30bf2fcb08924d Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Wed, 8 Apr 2026 10:54:03 -0700
Subject: [PATCH 3/7] nested parallel improvements

---
 Common/src/linear_algebra/CSysSolve.cpp       | 64 ++++++++++++-------
 .../src/drivers/CDiscAdjMultizoneDriver.cpp   |  3 +-
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index af1be89e403..af178fcd49e 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -62,8 +62,8 @@ constexpr float linSolEpsilon<float>() {
 
 /*--- Computes v = vs * ws or v += vs * ws with unrolling of up to 4 iterations. ---*/
 template <class ScalarType, class Weights, class Vectors>
-void LinearCombination(const unsigned long n, const Vectors& vs, const Weights& ws, CSysVector<ScalarType>& v,
-                       bool inc = false) {
+void LinearCombinationImpl(const unsigned long n, const Vectors& vs, const Weights& ws, CSysVector<ScalarType>& v,
+                           bool inc = false) {
   if (n == 0) {
     if (!inc) v = ScalarType{};
     return;
@@ -105,19 +105,32 @@ void LinearCombination(const unsigned long n, const Vectors& vs, const Weights&
 
 /*--- Overload to handle a vector of CSysVector directly. ---*/
 template <class ScalarType, class Weights>
-void LinearCombination(const unsigned long n, const std::vector<CSysVector<ScalarType>>& vs, const Weights& ws,
-                       CSysVector<ScalarType>& v, bool inc = false) {
-  LinearCombination(
+void LinearCombinationImpl(const unsigned long n, const std::vector<CSysVector<ScalarType>>& vs, const Weights& ws,
+                           CSysVector<ScalarType>& v, bool inc = false) {
+  LinearCombinationImpl(
       n, [&vs](auto i) -> auto& { return vs[i]; }, ws, v, inc);
 }
 
 /*--- Overload to handle a std::vector<T> of weights directly. ---*/
 template <class ScalarType, class Vectors>
-void LinearCombination(const unsigned long n, const Vectors& vs, const std::vector<ScalarType>& ws,
-                       CSysVector<ScalarType>& v, bool inc = false) {
-  LinearCombination(
+void LinearCombinationImpl(const unsigned long n, const Vectors& vs, const std::vector<ScalarType>& ws,
+                           CSysVector<ScalarType>& v, bool inc = false) {
+  LinearCombinationImpl(
       n, vs, [&ws](auto i) { return ws[i]; }, v, inc);
 }
+
+/*--- Wrapper around LinearCombinationImpl. ---*/
+template <class... Ts>
+void LinearCombination(bool parallel, Ts&&... args) {
+  if (parallel) {
+    SU2_OMP_PARALLEL
+    LinearCombinationImpl(std::forward<Ts>(args)...);
+    END_SU2_OMP_PARALLEL
+  } else {
+    LinearCombinationImpl(std::forward<Ts>(args)...);
+  }
+}
+
 }  // namespace
 
 template <class ScalarType>
@@ -578,13 +591,7 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
 
   const auto& basis = flexible ? Z : V;
 
-  if (nestedParallel) {
-    SU2_OMP_PARALLEL
-    LinearCombination(i, basis, y, x, true);
-    END_SU2_OMP_PARALLEL
-  } else {
-    LinearCombination(i, basis, y, x, true);
-  }
+  LinearCombination(nestedParallel, i, basis, y, x, true);
 
   /*---  Recalculate final (neg.) residual (this should be optional) ---*/
 
@@ -730,7 +737,7 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
       /*--- Make r orthogonal to the rebuilt V so we can proceed with the usual Arnoldi process. ---*/
       vr(j) = r.dot(V[j]);
     }
-    LinearCombination(k, V, -vr, r, true);
+    LinearCombination(nestedParallel, k, V, -vr, r, true);
 
     /*--- Apply R^-1 to Z and W and update x accordingly. R is uppper triangular,
      * so we loop backwards to compute the products in-place. ---*/
@@ -740,12 +747,12 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
         for (auto* basis : {&W, &Z}) {
           auto reversed = [&](auto i) -> const auto& { return (*basis)[j - i]; };
           LinearCombination(
-              j + 1, reversed, [&](auto i) { return invR(j - i, j); }, (*basis)[j]);
+              nestedParallel, j + 1, reversed, [&](auto i) { return invR(j - i, j); }, (*basis)[j]);
         }
         if (j == 0) break;  // j is unsigned, avoid underflow.
       }
     }
-    LinearCombination(k, Z, vr, x, true);
+    LinearCombination(nestedParallel, k, Z, vr, x, true);
   }
   ScalarType rNorm = r.norm();
   auto iter = k;
@@ -839,8 +846,8 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
 
     /*--- Update the solution and residual. The latter is only required if we restart. ---*/
 
-    LinearCombination(m, Z, y, x, true);
-    if (!converged) LinearCombination(m + 1, V, rls, r);
+    LinearCombination(nestedParallel, m, Z, y, x, true);
+    if (!converged) LinearCombination(nestedParallel, m + 1, V, rls, r);
 
     /*--- Update deflation vectors. ---*/
 
@@ -852,10 +859,21 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
     /*--- Compute Ritz values and keep the ones with the smallest real part. ---*/
 
     EigenMatrix VW = EigenMatrix::Identity(m + 1, m);
-    const auto& tmp = CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
+    const su2matrix<ScalarType>* VWk = nullptr;
+    if (nestedParallel) {
+      SU2_OMP_PARALLEL {
+        const auto& tmp = CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
+        SU2_OMP_MASTER
+        VWk = &tmp;
+        END_SU2_OMP_MASTER
+      }
+      END_SU2_OMP_PARALLEL
+    } else {
+      VWk = &CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
+    }
     for (auto i = 0ul; i <= m; ++i) {
       for (auto j = 0ul; j < k; ++j) {
-        VW(i, j) = tmp(i, j);
+        VW(i, j) = (*VWk)(i, j);
       }
     }
     const auto Hm = Heigen.topLeftCorner(m + 1, m);
@@ -917,7 +935,7 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
 
     auto modify = [&](const EigenMatrix& mod, const auto& basis) {
       for (auto j = 0ul; j < k_new; ++j) {
-        LinearCombination(mod.rows(), basis, mod.col(j), T[j]);
+        LinearCombination(nestedParallel, mod.rows(), basis, mod.col(j), T[j]);
       }
     };
     modify(
diff --git a/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp b/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
index 315ae662e7c..b31926271cf 100644
--- a/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
+++ b/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
@@ -381,7 +381,8 @@ void CDiscAdjMultizoneDriver::KrylovInnerIters(unsigned short iZone) {
     Scalar tol_l = KrylovTol / eps;
     auto iter = min(totalIter-2ul, config_container[iZone]->GetnQuasiNewtonSamples()-2ul);
     iter = LinSolver[iZone].FGCRODR_LinSolver(AdjRHS[iZone], AdjSol[iZone], product, Identity(),
-                                              tol_l, iter, eps_l, monitor, config_container[iZone]);
+                                              tol_l, iter, eps_l, monitor, config_container[iZone],
+                                              FgcrodrMode::SAME_MAT);
     totalIter -= iter+1;
     eps *= eps_l;
   }

From 972b9b212468da185341db2b3afb15bf996593b3 Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Fri, 10 Apr 2026 04:36:55 +0100
Subject: [PATCH 4/7] add test, non flexible mode for FGCRODR

---
 Common/include/linear_algebra/CSysSolve.hpp |   6 ++
 Common/src/linear_algebra/CSysSolve.cpp     |  91 ++++++++++-------
 TestCases/vandv.py                          |  14 ++-
 TestCases/vandv/rans/30p30n/config_ad.cfg   | 104 ++++++++++++++++++++
 4 files changed, 178 insertions(+), 37 deletions(-)
 create mode 100644 TestCases/vandv/rans/30p30n/config_ad.cfg

diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp
index 611223b657b..ec083535eef 100644
--- a/Common/include/linear_algebra/CSysSolve.hpp
+++ b/Common/include/linear_algebra/CSysSolve.hpp
@@ -41,6 +41,11 @@
 #include "CSysVector.hpp"
 #include "../option_structure.hpp"
 
+SU2_IGNORE_WARNING("-Wmaybe-uninitialized")
+#include "Eigen/Core"
+#include "Eigen/Dense"
+SU2_RESTORE_WARNING
+
 class CConfig;
 class CGeometry;
 template <class T>
@@ -110,6 +115,7 @@ class CSysSolve {
   mutable unsigned long k = 0;
   mutable std::vector<VectorType> Z, V; /*!< \brief Large matrices used by FGMRES, v^i+1 = A * z^i. */
   mutable std::vector<VectorType> W, T; /*!< \brief Large matrices used by FGCRODR for deflation vectors. */
+  mutable Eigen::Matrix<ScalarType, Eigen::Dynamic, Eigen::Dynamic> VWk;
 
   /*!< \brief Temporary used when it is necessary to interface between active and passive types. */
   VectorType LinSysSol_tmp;
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index af178fcd49e..2141c263344 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -35,8 +35,6 @@
 #include "../../include/linear_algebra/CPreconditioner.hpp"
 
 SU2_IGNORE_WARNING("-Wmaybe-uninitialized")
-#include "Eigen/Core"
-#include "Eigen/Dense"
 #include "Eigen/Eigenvalues"
 SU2_RESTORE_WARNING
 
@@ -661,6 +659,7 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
   const auto deflation = min(config->GetLinear_Solver_Restart_Deflation(), m - 1);
 
   const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
+  const bool flexible = !precond.IsIdentity();
   /*--- If we call the solver outside of a parallel region, but the number of threads allows,
    * we still want to parallelize some of the expensive operations. ---*/
   const bool nestedParallel = !omp_in_parallel() && omp_get_max_threads() > 1;
@@ -685,8 +684,10 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
       r.Initialize(nBlk, nBlkDomain, nVar, nullptr);
       V.resize(m + 1);
       for (auto& v : V) v.Initialize(nBlk, nBlkDomain, nVar, nullptr);
-      Z.resize(m);
-      for (auto& z : Z) z.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      if (flexible) {
+        Z.resize(m);
+        for (auto& z : Z) z.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      }
       W.resize(deflation + 1);
       for (auto& w : W) w.Initialize(nBlk, nBlkDomain, nVar, nullptr);
       T.resize(deflation + 1);
@@ -707,12 +708,21 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
   /*--- Calculate the initial residual and compute its norm. ---*/
 
   if (!xIsZero) {
-    mat_vec(x, Z.back());
-    r = b - Z.back();
+    mat_vec(x, T[0]);
+    r = b - T[0];
   } else {
     r = b;
   }
 
+  /*--- We don't store the part of W that is equal to V explicitly, W(:, k:m) = V(:, k:m). ---*/
+  auto GetW = [&](auto i) -> auto& { return i < k ? W[i] : V[i]; };
+
+  /*--- With an identity preconditioner Z = W. ---*/
+  auto GetZ = [&](auto i) -> auto& {
+    if (flexible) return Z[i];
+    return GetW(i);
+  };
+
   /*--- Rebuild Z, V, and W for the new matrix if necessary.
    * Q * R = A * Z
    * V = Q = A * (Z * R^-1)
@@ -725,7 +735,7 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
       if (mode != FgcrodrMode::SAME_MAT) {
         /*--- When k = 0, Z = M(W), we could keep that property but it is not
          * critical and so we choose to save the cost of precond(W[j], Z[j]); ---*/
-        mat_vec(Z[j], V[j]);
+        mat_vec(GetZ(j), V[j]);
 
         for (auto i = 0ul; i < j; ++i) {
           R(i, j) = V[i].dot(V[j]);
@@ -748,11 +758,12 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
           auto reversed = [&](auto i) -> const auto& { return (*basis)[j - i]; };
           LinearCombination(
               nestedParallel, j + 1, reversed, [&](auto i) { return invR(j - i, j); }, (*basis)[j]);
+          if (!flexible) break;  // skip Z.
         }
         if (j == 0) break;  // j is unsigned, avoid underflow.
       }
     }
-    LinearCombination(nestedParallel, k, Z, vr, x, true);
+    LinearCombination(nestedParallel, k, GetZ, vr, x, true);
   }
   ScalarType rNorm = r.norm();
   auto iter = k;
@@ -800,8 +811,12 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
     bool converged = false;
     for (auto j = k; j < m; ++j) {
       ++iter;
-      precond(V[j], Z[j]);
-      mat_vec(Z[j], V[j + 1]);
+      if (flexible) {
+        precond(V[j], Z[j]);
+        mat_vec(Z[j], V[j + 1]);
+      } else {
+        mat_vec(V[j], V[j + 1]);
+      }
 
       if (nestedParallel) {
         /*--- "omp parallel if" does not work well here ---*/
@@ -841,12 +856,9 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
       }
     }
 
-    /*--- We don't store the part of W that is equal to V explicitly,
-     * W(:, k:m) = V(:, k:m). ---*/
-
     /*--- Update the solution and residual. The latter is only required if we restart. ---*/
 
-    LinearCombination(nestedParallel, m, Z, y, x, true);
+    LinearCombination(nestedParallel, m, GetZ, y, x, true);
     if (!converged) LinearCombination(nestedParallel, m + 1, V, rls, r);
 
     /*--- Update deflation vectors. ---*/
@@ -859,22 +871,23 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
     /*--- Compute Ritz values and keep the ones with the smallest real part. ---*/
 
     EigenMatrix VW = EigenMatrix::Identity(m + 1, m);
-    const su2matrix<ScalarType>* VWk = nullptr;
-    if (nestedParallel) {
-      SU2_OMP_PARALLEL {
-        const auto& tmp = CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
-        SU2_OMP_MASTER
-        VWk = &tmp;
-        END_SU2_OMP_MASTER
+    if (mode != FgcrodrMode::SAME_MAT) {
+      const su2matrix<ScalarType>* VWk = nullptr;
+      if (nestedParallel) {
+        SU2_OMP_PARALLEL
+        VWk = &CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
+        END_SU2_OMP_PARALLEL
+      } else {
+        VWk = &CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
       }
-      END_SU2_OMP_PARALLEL
-    } else {
-      VWk = &CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
-    }
-    for (auto i = 0ul; i <= m; ++i) {
-      for (auto j = 0ul; j < k; ++j) {
-        VW(i, j) = (*VWk)(i, j);
+      for (auto i = 0ul; i <= m; ++i) {
+        for (auto j = 0ul; j < k; ++j) {
+          VW(i, j) = (*VWk)(i, j);
+        }
       }
+    } else if (k > 0) {
+      /*--- See notes near the end of the outer loop. ---*/
+      VW.topLeftCorner(k, k) = VWk.topRows(k);
     }
     const auto Hm = Heigen.topLeftCorner(m + 1, m);
     EigenMatrix HTVW = Hm.transpose() * VW;
@@ -938,10 +951,16 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
         LinearCombination(nestedParallel, mod.rows(), basis, mod.col(j), T[j]);
       }
     };
-    modify(
-        PinvR, [&](auto i) -> auto& { return i < k ? W[i] : V[i]; });
+    modify(PinvR, GetW);
 
     BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
+      /*--- Initialize VWk, then apply the V and W modifications of the left and right, respectively. ---*/
+      if (mode == FgcrodrMode::SAME_MAT) {
+        if (k == 0) {
+          VWk = EigenMatrix::Identity(m + 1, k_new);
+        }
+        VWk.topRows(k) = Q.transpose() * (VWk * PinvR);
+      }
       /*--- T and W are the same size, so we can swap them. ---*/
       std::swap(T, W);
       k = k_new;
@@ -956,8 +975,10 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
       END_SU2_OMP_SAFE_GLOBAL_ACCESS
     };
 
-    modify(PinvR, Z);
-    update(Z);
+    if (flexible) {
+      modify(PinvR, Z);
+      update(Z);
+    }
 
     /*--- Update V only if necessary. ---*/
     if (!converged || mode == FgcrodrMode::SAME_MAT) {
@@ -976,9 +997,9 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
       END_SU2_OMP_MASTER
     }
     if (recomputeRes) {
-      mat_vec(x, Z.back());
-      Z.back() -= b;
-      ScalarType res = Z.back().norm();
+      mat_vec(x, T[0]);
+      T[0] -= b;
+      ScalarType res = T[0].norm();
 
       if (fabs(res - rNorm) > tol * 10) {
         if (masterRank) {
diff --git a/TestCases/vandv.py b/TestCases/vandv.py
index d207912bbfa..1eac30f0499 100644
--- a/TestCases/vandv.py
+++ b/TestCases/vandv.py
@@ -28,7 +28,7 @@
 # You should have received a copy of the GNU Lesser General Public
 # License along with SU2. If not, see <http://www.gnu.org/licenses/>.
 
-import sys
+import sys, os
 from TestCase import TestCase
 
 def main():
@@ -48,6 +48,15 @@ def main():
     p30n30.test_vals = [-11.267106, -11.168215, -11.182822, -10.949673, -14.233489, 0.052235, 2.830394, 1.318894, -1.210648, 1, 1.2763e+01]
     test_list.append(p30n30)
 
+    os.symlink("vandv/rans/30p30n/solution.dat", "vandv/rans/30p30n/solution_0.dat")
+    p30n30_ad = TestCase('30P30N_ad')
+    p30n30_ad.cfg_dir = "vandv/rans/30p30n"
+    p30n30_ad.cfg_file = "config_ad.cfg"
+    p30n30_ad.test_iter = 5
+    p30n30_ad.test_vals = [-8.167332, -8.738471, -8.762033, -8.500107, -7.433292, -0.808628, -2.091805, 1.1791e-01, 3.4123e+01]
+    p30n30_ad.command = TestCase.Command("mpirun -n 2", "SU2_CFD_AD")
+    test_list.append(p30n30_ad)
+
     # flat plate - sst-v1994m
     flatplate_sst1994m           = TestCase('flatplate_sst1994m')
     flatplate_sst1994m.cfg_dir   = "vandv/rans/flatplate"
@@ -120,7 +129,8 @@ def main():
     #################
 
     for test in test_list:
-        test.command = TestCase.Command("mpirun -n 2", "SU2_CFD")
+        if test.command.empty():
+            test.command = TestCase.Command("mpirun -n 2", "SU2_CFD")
         test.timeout = 300
         test.tol = 1e-5
     #end
diff --git a/TestCases/vandv/rans/30p30n/config_ad.cfg b/TestCases/vandv/rans/30p30n/config_ad.cfg
new file mode 100644
index 00000000000..263be10bab4
--- /dev/null
+++ b/TestCases/vandv/rans/30p30n/config_ad.cfg
@@ -0,0 +1,104 @@
+% ------------------------------- SOLVER -------------------------------- %
+%
+SOLVER= RANS
+KIND_TURB_MODEL= SA
+REF_DIMENSIONALIZATION= FREESTREAM_VEL_EQ_MACH
+%
+OBJECTIVE_FUNCTION= LIFT
+%
+% ----------------------------- FREESTREAM ------------------------------ %
+%
+MACH_NUMBER= 0.17
+AOA= 5.5
+INIT_OPTION= REYNOLDS
+FREESTREAM_OPTION= TEMPERATURE_FS
+FREESTREAM_TEMPERATURE= 295.56
+REYNOLDS_NUMBER= 1.71E6
+REYNOLDS_LENGTH= 0.4572
+%
+% -------------------------- REFERENCE VALUES --------------------------- %
+%
+REF_AREA= 0.4572
+REF_LENGTH= 0.4572
+REF_ORIGIN_MOMENT_X= 0.0
+REF_ORIGIN_MOMENT_Y= 0.0
+REF_ORIGIN_MOMENT_Z= 0.0
+%
+% ------------------------- BOUNDARY CONDITIONS ------------------------- %
+%
+MARKER_HEATFLUX= ( wall, 0.0 )
+MARKER_FAR= ( farfield )
+MARKER_PLOTTING= ( wall )
+MARKER_MONITORING= ( wall )
+%
+% ---------------------------- FLUID MODELS ----------------------------- %
+%
+FLUID_MODEL= STANDARD_AIR
+GAMMA_VALUE= 1.4
+GAS_CONSTANT= 287.058
+%
+VISCOSITY_MODEL= SUTHERLAND
+MU_REF= 1.716E-5
+MU_T_REF= 273.15
+SUTHERLAND_CONSTANT= 110.4
+%
+CONDUCTIVITY_MODEL= CONSTANT_PRANDTL
+PRANDTL_LAM= 0.72
+PRANDTL_TURB= 0.90
+%
+% ----------------------- SPATIAL DISCRETIZATION ------------------------ %
+%
+NUM_METHOD_GRAD= GREEN_GAUSS
+CONV_NUM_METHOD_FLOW= ROE
+ENTROPY_FIX_COEFF= 1e-5
+MUSCL_FLOW= YES
+SLOPE_LIMITER_FLOW= VAN_ALBADA_EDGE
+%
+CONV_NUM_METHOD_TURB= SCALAR_UPWIND
+MUSCL_TURB= NO
+%
+% ---------- PSEUDOTIME INTEGRATION / CONVERGENCE ACCELERATION ---------- %
+%
+TIME_DISCRE_FLOW= EULER_IMPLICIT
+TIME_DISCRE_TURB= EULER_IMPLICIT
+%
+CFL_NUMBER= 2500
+CFL_REDUCTION_TURB= 1
+CFL_ADAPT= NO
+%
+DISCADJ_LIN_SOLVER= SMOOTHER
+DISCADJ_LIN_PREC= ILU
+LINEAR_SOLVER_ERROR= 1e-30
+LINEAR_SOLVER_ITER= 20
+LINEAR_SOLVER_SMOOTHER_RELAXATION= 0.6
+%
+MGLEVEL= 0
+%
+% Adjoint GMRES settings.
+MULTIZONE= YES
+NEWTON_KRYLOV= YES
+% These 3 numbers should usually be the same.
+INNER_ITER= 60
+QUASI_NEWTON_NUM_SAMPLES= 60
+LINEAR_SOLVER_RESTART_FREQUENCY= 60
+% This should be ~1/4 of the restart frequency. After the first outer
+% iteration, the solver does "frequency - deflation" inner iterations.
+LINEAR_SOLVER_RESTART_DEFLATION= 15
+%
+% ------------------------ CONVERGENCE CRITERIA ------------------------- %
+%
+OUTER_ITER= 10
+CONV_RESIDUAL_MINVAL= -8
+%
+% --------------------------- INPUT / OUTPUT ---------------------------- %
+%
+MESH_FILENAME= 2D_L1_coarse_r1.su2
+MESH_FORMAT= SU2
+RESTART_SOL= NO
+OUTPUT_WRT_FREQ= 1000
+SCREEN_WRT_FREQ_INNER= 1
+WRT_AD_STATISTICS= NO
+WRT_ZONE_CONV= NO
+WRT_ZONE_HIST= YES
+HISTORY_OUTPUT= ( ITER, RMS_RES, LINSOL, SENSITIVITY )
+SCREEN_OUTPUT= ( OUTER_ITER, ITER_TIME, RMS_RES[0], LINSOL_RESIDUAL[0], LINSOL_RESIDUAL_TURB[0], SENS_AOA[0], SENS_MACH[0] )

From 322d4d42e2b5278b86b60b7ffdd07e3852d06c3d Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Fri, 10 Apr 2026 12:38:31 +0100
Subject: [PATCH 5/7] fix the VkWk optimization

---
 Common/include/linear_algebra/CSysSolve.hpp  |  2 +-
 Common/include/linear_algebra/CSysVector.hpp |  5 ++-
 Common/src/linear_algebra/CSysSolve.cpp      | 41 +++++++++++---------
 Common/src/linear_algebra/CSysVector.cpp     |  4 +-
 4 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp
index ec083535eef..86d89279ba3 100644
--- a/Common/include/linear_algebra/CSysSolve.hpp
+++ b/Common/include/linear_algebra/CSysSolve.hpp
@@ -115,7 +115,7 @@ class CSysSolve {
   mutable unsigned long k = 0;
   mutable std::vector<VectorType> Z, V; /*!< \brief Large matrices used by FGMRES, v^i+1 = A * z^i. */
   mutable std::vector<VectorType> W, T; /*!< \brief Large matrices used by FGCRODR for deflation vectors. */
-  mutable Eigen::Matrix<ScalarType, Eigen::Dynamic, Eigen::Dynamic> VWk;
+  mutable Eigen::Matrix<ScalarType, Eigen::Dynamic, Eigen::Dynamic> VkWk;
 
   /*!< \brief Temporary used when it is necessary to interface between active and passive types. */
   VectorType LinSysSol_tmp;
diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
index 316b4d8e993..c0ab789d4ff 100644
--- a/Common/include/linear_algebra/CSysVector.hpp
+++ b/Common/include/linear_algebra/CSysVector.hpp
@@ -375,12 +375,13 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
   /*!
    * \brief Computes the product of V^T W efficiencly, where V and W are tall matrices stored as vectors of CSysVector.
    * \param[in] V - Tall matrix.
-   * \param[in] n - Number of columns to consider from V.
+   * \param[in] i0 - First column of V to consider.
+   * \param[in] n - Number of columns to consider from V starting at i0.
    * \param[in] W - Tall matrix.
    * \param[in] m - Number of columns to consider from W.
    * \return n by m matrix with the result of the product.
    */
-  static const su2matrix<ScalarType>& multiDot(const std::vector<CSysVector>& V, size_t n,
+  static const su2matrix<ScalarType>& multiDot(const std::vector<CSysVector>& V, size_t i0, size_t n,
                                                const std::vector<CSysVector>& W, size_t m);
 
   /*!
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index 2141c263344..218479ee0ee 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -658,8 +658,9 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
   auto m = min(config->GetLinear_Solver_Restart_Frequency(), max_iter);
   const auto deflation = min(config->GetLinear_Solver_Restart_Deflation(), m - 1);
 
-  const bool masterRank = (SU2_MPI::GetRank() == MASTER_NODE);
   const bool flexible = !precond.IsIdentity();
+  const bool same_mat = mode == FgcrodrMode::SAME_MAT;
+  const bool masterRank = SU2_MPI::GetRank() == MASTER_NODE;
   /*--- If we call the solver outside of a parallel region, but the number of threads allows,
    * we still want to parallelize some of the expensive operations. ---*/
   const bool nestedParallel = !omp_in_parallel() && omp_get_max_threads() > 1;
@@ -732,7 +733,7 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
     EigenMatrix R = EigenMatrix::Zero(k, k);
     EigenVector vr = EigenVector::Zero(k);
     for (auto j = 0ul; j < k; ++j) {
-      if (mode != FgcrodrMode::SAME_MAT) {
+      if (!same_mat) {
         /*--- When k = 0, Z = M(W), we could keep that property but it is not
          * critical and so we choose to save the cost of precond(W[j], Z[j]); ---*/
         mat_vec(GetZ(j), V[j]);
@@ -751,7 +752,7 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
 
     /*--- Apply R^-1 to Z and W and update x accordingly. R is uppper triangular,
      * so we loop backwards to compute the products in-place. ---*/
-    if (mode != FgcrodrMode::SAME_MAT) {
+    if (!same_mat) {
       EigenMatrix invR = R.template triangularView<Eigen::Upper>().solve(EigenMatrix::Identity(k, k));
       for (auto j = k - 1;; --j) {
         for (auto* basis : {&W, &Z}) {
@@ -871,23 +872,29 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
     /*--- Compute Ritz values and keep the ones with the smallest real part. ---*/
 
     EigenMatrix VW = EigenMatrix::Identity(m + 1, m);
-    if (mode != FgcrodrMode::SAME_MAT) {
+    {
+      /*--- Part of VW known from previous cycle. See notes near the end of the outer loop. ---*/
+      if (same_mat && k > 0) VW.topLeftCorner(k, k) = VkWk;
+
+      /*--- Rest of VW. Either V[k]^T * Wk or the entire V^T * Wk depending on the mode.
+       * When the matrix stays constant, V[k+1:m+1] are orthogonal to Wk, but when it changes,
+       * we need to compute that part of the product. Since m >> k, there is less benefit in
+       * avoiding the cost of V[0:k]^T * Wk and we opt to make the code a little simpler. ---*/
       const su2matrix<ScalarType>* VWk = nullptr;
+      const auto i0 = same_mat ? k : 0;
+      const auto n = same_mat ? 1 : m + 1;
       if (nestedParallel) {
         SU2_OMP_PARALLEL
-        VWk = &CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
+        VWk = &CSysVector<ScalarType>::multiDot(V, i0, n, W, k);
         END_SU2_OMP_PARALLEL
       } else {
-        VWk = &CSysVector<ScalarType>::multiDot(V, m + 1, W, k);
+        VWk = &CSysVector<ScalarType>::multiDot(V, i0, n, W, k);
       }
-      for (auto i = 0ul; i <= m; ++i) {
+      for (auto i = 0ul; i < n; ++i) {
         for (auto j = 0ul; j < k; ++j) {
-          VW(i, j) = (*VWk)(i, j);
+          VW(i0 + i, j) = (*VWk)(i, j);
         }
       }
-    } else if (k > 0) {
-      /*--- See notes near the end of the outer loop. ---*/
-      VW.topLeftCorner(k, k) = VWk.topRows(k);
     }
     const auto Hm = Heigen.topLeftCorner(m + 1, m);
     EigenMatrix HTVW = Hm.transpose() * VW;
@@ -954,13 +961,9 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
     modify(PinvR, GetW);
 
     BEGIN_SU2_OMP_SAFE_GLOBAL_ACCESS {
-      /*--- Initialize VWk, then apply the V and W modifications of the left and right, respectively. ---*/
-      if (mode == FgcrodrMode::SAME_MAT) {
-        if (k == 0) {
-          VWk = EigenMatrix::Identity(m + 1, k_new);
-        }
-        VWk.topRows(k) = Q.transpose() * (VWk * PinvR);
-      }
+      /*--- Apply the V and W modifications to the left and right of the current VW, respectively. ---*/
+      if (same_mat) VkWk.noalias() = Q.transpose() * (VW * PinvR);
+
       /*--- T and W are the same size, so we can swap them. ---*/
       std::swap(T, W);
       k = k_new;
@@ -981,7 +984,7 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
     }
 
     /*--- Update V only if necessary. ---*/
-    if (!converged || mode == FgcrodrMode::SAME_MAT) {
+    if (!converged || same_mat) {
       modify(Q, V);
       update(V);
     }
diff --git a/Common/src/linear_algebra/CSysVector.cpp b/Common/src/linear_algebra/CSysVector.cpp
index 5d0009f39fa..8f767f44ab4 100644
--- a/Common/src/linear_algebra/CSysVector.cpp
+++ b/Common/src/linear_algebra/CSysVector.cpp
@@ -69,7 +69,7 @@ void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numB
 
 template <class ScalarType>
 const su2matrix<ScalarType>& CSysVector<ScalarType>::multiDot(const std::vector<CSysVector<ScalarType>>& V,
-                                                              const size_t n,
+                                                              const size_t i0, const size_t n,
                                                               const std::vector<CSysVector<ScalarType>>& W,
                                                               const size_t m) {
   static constexpr size_t BLOCK_SIZE = 1024;
@@ -87,7 +87,7 @@ const su2matrix<ScalarType>& CSysVector<ScalarType>::multiDot(const std::vector<
   for (size_t offset = 0; offset < size; offset += BLOCK_SIZE) {
     const auto limit = std::min(offset + BLOCK_SIZE, size);
     for (size_t i = 0; i < n; ++i) {
-      const auto& vi = V[i];
+      const auto& vi = V[i0 + i];
       for (size_t j = 0; j < m; ++j) {
         const auto& wj = W[j];
         ScalarType sum = 0.0;

From 823742a084217b2250a769d04e8f4e5ae2ad137d Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Sat, 11 Apr 2026 08:05:40 +0100
Subject: [PATCH 6/7] simplify settings

---
 Common/include/linear_algebra/CSysSolve.hpp     | 17 +++++++++--------
 Common/src/linear_algebra/CSysSolve.cpp         | 11 ++++++-----
 SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp |  5 +++--
 .../disc_adj_fsi/Airfoil_2d/configFlow.cfg      | 10 +++++-----
 TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg   |  5 ++---
 TestCases/parallel_regression.py                |  1 +
 TestCases/vandv.py                              |  4 ++--
 TestCases/vandv/rans/30p30n/config_ad.cfg       |  9 ++++-----
 8 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp
index 86d89279ba3..84b63bcfb4f 100644
--- a/Common/include/linear_algebra/CSysSolve.hpp
+++ b/Common/include/linear_algebra/CSysSolve.hpp
@@ -298,8 +298,8 @@ class CSysSolve {
   template <class Dummy = int>
   unsigned long FGCRODR_LinSolverImpl(const VectorType& b, VectorType& x, const ProductType& mat_vec,
                                       const PrecondType& precond, ScalarType tol, unsigned long max_iter,
-                                      ScalarType& residual, bool monitoring, const CConfig* config,
-                                      FgcrodrMode mode) const;
+                                      ScalarType& residual, bool monitoring, const CConfig* config, FgcrodrMode mode,
+                                      unsigned long custom_m) const;
 
   /*!
    * \brief Creates the inner solver for nested preconditioning if the settings allow it.
@@ -322,7 +322,7 @@ class CSysSolve {
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] m - maximum size of the search subspace
    * \param[out] residual - final normalized residual
-   * \param[in] monitoring - turn on priting residuals from solver to screen.
+   * \param[in] monitoring - turn on printing residuals from solver to screen
    * \param[in] config - Definition of the particular problem.
    */
   unsigned long CG_LinSolver(const VectorType& b, VectorType& x, const ProductType& mat_vec, const PrecondType& precond,
@@ -338,7 +338,7 @@ class CSysSolve {
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] m - maximum size of the search subspace
    * \param[out] residual - final normalized residual
-   * \param[in] monitoring - turn on priting residuals from solver to screen.
+   * \param[in] monitoring - turn on printing residuals from solver to screen
    * \param[in] config - Definition of the particular problem.
    */
   unsigned long FGMRES_LinSolver(const VectorType& b, VectorType& x, const ProductType& mat_vec,
@@ -361,14 +361,15 @@ class CSysSolve {
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] max_iter - maximum number of iterations
    * \param[out] residual - final normalized residual
-   * \param[in] monitoring - turn on priting residuals from solver to screen.
+   * \param[in] monitoring - turn on printing residuals from solver to screen
    * \param[in] config - Definition of the particular problem.
    * \param[in] mode - See FgcrodrMode.
+   * \param[in] custom_m - alternative maximum size of the search subspace, overrides the config value if != 0.
    */
   unsigned long FGCRODR_LinSolver(const VectorType& b, VectorType& x, const ProductType& mat_vec,
                                   const PrecondType& precond, ScalarType tol, unsigned long max_iter,
                                   ScalarType& residual, bool monitoring, const CConfig* config,
-                                  FgcrodrMode mode = FgcrodrMode::NORMAL) const;
+                                  FgcrodrMode mode = FgcrodrMode::NORMAL, unsigned long custom_m = 0) const;
 
   /*!
    * \brief Biconjugate Gradient Stabilized Method (BCGSTAB)
@@ -379,7 +380,7 @@ class CSysSolve {
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] m - maximum size of the search subspace
    * \param[out] residual - final normalized residual
-   * \param[in] monitoring - turn on priting residuals from solver to screen.
+   * \param[in] monitoring - turn on printing residuals from solver to screen
    * \param[in] config - Definition of the particular problem.
    */
   unsigned long BCGSTAB_LinSolver(const VectorType& b, VectorType& x, const ProductType& mat_vec,
@@ -395,7 +396,7 @@ class CSysSolve {
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] m - maximum number of iterations
    * \param[out] residual - final normalized residual
-   * \param[in] monitoring - turn on priting residuals from solver to screen.
+   * \param[in] monitoring - turn on printing residuals from solver to screen
    * \param[in] config - Definition of the particular problem.
    */
   unsigned long Smoother_LinSolver(const VectorType& b, VectorType& x, const ProductType& mat_vec,
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index 218479ee0ee..e093452cfb9 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -650,12 +650,12 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolverImpl(const CSysVector<Scal
                                                            const CMatrixVectorProduct<ScalarType>& mat_vec,
                                                            const CPreconditioner<ScalarType>& precond, ScalarType tol,
                                                            unsigned long max_iter, ScalarType& residual,
-                                                           bool monitoring, const CConfig* config,
-                                                           FgcrodrMode mode) const {
+                                                           bool monitoring, const CConfig* config, FgcrodrMode mode,
+                                                           unsigned long custom_m) const {
   using EigenMatrix = Eigen::Matrix<ScalarType, Eigen::Dynamic, Eigen::Dynamic>;
   using EigenVector = Eigen::Matrix<ScalarType, Eigen::Dynamic, 1>;
 
-  auto m = min(config->GetLinear_Solver_Restart_Frequency(), max_iter);
+  auto m = min(custom_m != 0 ? custom_m : config->GetLinear_Solver_Restart_Frequency(), max_iter);
   const auto deflation = min(config->GetLinear_Solver_Restart_Deflation(), m - 1);
 
   const bool flexible = !precond.IsIdentity();
@@ -1021,9 +1021,10 @@ unsigned long CSysSolve<ScalarType>::FGCRODR_LinSolver(const CSysVector<ScalarTy
                                                        const CMatrixVectorProduct<ScalarType>& mat_vec,
                                                        const CPreconditioner<ScalarType>& precond, ScalarType tol,
                                                        unsigned long max_iter, ScalarType& residual, bool monitoring,
-                                                       const CConfig* config, [[maybe_unused]] FgcrodrMode mode) const {
+                                                       const CConfig* config, [[maybe_unused]] FgcrodrMode mode,
+                                                       [[maybe_unused]] unsigned long custom_m) const {
   if constexpr (std::is_same_v<ScalarType, float> || std::is_same_v<ScalarType, double>) {
-    return FGCRODR_LinSolverImpl<>(b, x, mat_vec, precond, tol, max_iter, residual, monitoring, config, mode);
+    return FGCRODR_LinSolverImpl<>(b, x, mat_vec, precond, tol, max_iter, residual, monitoring, config, mode, custom_m);
   } else {
     return RFGMRES_LinSolver(b, x, mat_vec, precond, tol, max_iter, residual, monitoring, config);
   }
diff --git a/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp b/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
index b31926271cf..c41544a8f3a 100644
--- a/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
+++ b/SU2_CFD/src/drivers/CDiscAdjMultizoneDriver.cpp
@@ -382,7 +382,7 @@ void CDiscAdjMultizoneDriver::KrylovInnerIters(unsigned short iZone) {
     auto iter = min(totalIter-2ul, config_container[iZone]->GetnQuasiNewtonSamples()-2ul);
     iter = LinSolver[iZone].FGCRODR_LinSolver(AdjRHS[iZone], AdjSol[iZone], product, Identity(),
                                               tol_l, iter, eps_l, monitor, config_container[iZone],
-                                              FgcrodrMode::SAME_MAT);
+                                              FgcrodrMode::SAME_MAT, iter);
     totalIter -= iter+1;
     eps *= eps_l;
   }
@@ -416,7 +416,7 @@ void CDiscAdjMultizoneDriver::Run() {
   /*--- Temporary warning because we need to test writing intermediate output to file (requires re-recording). ---*/
   for(iZone = 0; iZone < nZone; iZone++) {
     for (auto iVolumeFreq = 0; iVolumeFreq < config_container[iZone]->GetnVolumeOutputFrequencies(); iVolumeFreq++){
-      if (config_container[iZone]->GetVolumeOutputFrequency(iVolumeFreq) < nOuterIter) {
+      if (!time_domain && config_container[iZone]->GetVolumeOutputFrequency(iVolumeFreq) < nOuterIter) {
         if (rank == MASTER_NODE) {
           cout << "\nWARNING (iZone = " << iZone
                << "): "
@@ -574,6 +574,7 @@ void CDiscAdjMultizoneDriver::Run() {
   }
 
   if (time_domain) {
+    for (const auto& ls : LinSolver) ls.ResetDeflation();
     EvaluateSensitivities(TimeIter, (TimeIter+1) == driver_config->GetnTime_Iter());
   }
 
diff --git a/TestCases/disc_adj_fsi/Airfoil_2d/configFlow.cfg b/TestCases/disc_adj_fsi/Airfoil_2d/configFlow.cfg
index 1c62889d0b6..96fe4fb0419 100755
--- a/TestCases/disc_adj_fsi/Airfoil_2d/configFlow.cfg
+++ b/TestCases/disc_adj_fsi/Airfoil_2d/configFlow.cfg
@@ -49,13 +49,13 @@ JST_SENSOR_COEFF= ( 0.5, 0.02 )
 TIME_DISCRE_FLOW= EULER_IMPLICIT
 
 % Linear solvers ------------------------------------------------------- %
-LINEAR_SOLVER= FGMRES
-LINEAR_SOLVER_PREC= ILU
-LINEAR_SOLVER_ERROR= 1E-4
-LINEAR_SOLVER_ITER= 50
 DISCADJ_LIN_SOLVER= FGMRES
 DISCADJ_LIN_PREC= ILU
+LINEAR_SOLVER_ERROR= 1e-4
+LINEAR_SOLVER_ITER= 50
+%
 NEWTON_KRYLOV= YES
+LINEAR_SOLVER_RESTART_DEFLATION= 8
 QUASI_NEWTON_NUM_SAMPLES= 999
 %
 DEFORM_LINEAR_SOLVER= CONJUGATE_GRADIENT
@@ -71,7 +71,7 @@ TIME_ITER= 1
 BGS_RELAXATION= FIXED_PARAMETER
 STAT_RELAX_PARAMETER= 1.0
 % fluid
-INNER_ITER= 51
+INNER_ITER= 41
 CONV_STARTITER= 0
 CONV_RESIDUAL_MINVAL= -9
 
diff --git a/TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg b/TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg
index f5c233ec9cc..a59a5b0ce8f 100644
--- a/TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg
+++ b/TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg
@@ -54,9 +54,7 @@ ENTROPY_FIX_COEFF= 0.01
 TIME_DISCRE_FLOW= EULER_IMPLICIT
 
 % Linear solvers ------------------------------------------------------- %
-LINEAR_SOLVER= FGMRES
-LINEAR_SOLVER_PREC= ILU
-LINEAR_SOLVER_ERROR= 1E-6
+LINEAR_SOLVER_ERROR= 1E-30
 LINEAR_SOLVER_ITER= 25
 DISCADJ_LIN_SOLVER= SMOOTHER
 DISCADJ_LIN_PREC= ILU
@@ -64,6 +62,7 @@ LINEAR_SOLVER_SMOOTHER_RELAXATION= 0.7
 MGLEVEL= 0
 NEWTON_KRYLOV= YES
 QUASI_NEWTON_NUM_SAMPLES= 999
+LINEAR_SOLVER_RESTART_DEFLATION= 0
 %
 DEFORM_LINEAR_SOLVER= CONJUGATE_GRADIENT
 DEFORM_LINEAR_SOLVER_PREC= ILU
diff --git a/TestCases/parallel_regression.py b/TestCases/parallel_regression.py
index 2925725d1eb..c619d8b5ea4 100755
--- a/TestCases/parallel_regression.py
+++ b/TestCases/parallel_regression.py
@@ -1323,6 +1323,7 @@ def main():
     nonlinear_plane_stress.cfg_file = "nonlinear_plane_stress_2d.cfg"
     nonlinear_plane_stress.test_iter = 19
     nonlinear_plane_stress.test_vals = [-7.433449, -3.355607, -13.983863, 162480, 43, -4.070373]
+    nonlinear_plane_stress.tol = [2e-4, 2e-4, 2e-4, 1e-5, 1e-5, 4e-4]
     test_list.append(nonlinear_plane_stress)
 
     # Dynamic beam, 2d
diff --git a/TestCases/vandv.py b/TestCases/vandv.py
index 1eac30f0499..4c54ff4228b 100644
--- a/TestCases/vandv.py
+++ b/TestCases/vandv.py
@@ -28,7 +28,7 @@
 # You should have received a copy of the GNU Lesser General Public
 # License along with SU2. If not, see <http://www.gnu.org/licenses/>.
 
-import sys, os
+import sys, shutil
 from TestCase import TestCase
 
 def main():
@@ -48,7 +48,7 @@ def main():
     p30n30.test_vals = [-11.267106, -11.168215, -11.182822, -10.949673, -14.233489, 0.052235, 2.830394, 1.318894, -1.210648, 1, 1.2763e+01]
     test_list.append(p30n30)
 
-    os.symlink("vandv/rans/30p30n/solution.dat", "vandv/rans/30p30n/solution_0.dat")
+    shutil.copy("vandv/rans/30p30n/solution.dat", "vandv/rans/30p30n/solution_0.dat")
     p30n30_ad = TestCase('30P30N_ad')
     p30n30_ad.cfg_dir = "vandv/rans/30p30n"
     p30n30_ad.cfg_file = "config_ad.cfg"
diff --git a/TestCases/vandv/rans/30p30n/config_ad.cfg b/TestCases/vandv/rans/30p30n/config_ad.cfg
index 263be10bab4..5892f8d5520 100644
--- a/TestCases/vandv/rans/30p30n/config_ad.cfg
+++ b/TestCases/vandv/rans/30p30n/config_ad.cfg
@@ -77,13 +77,12 @@ MGLEVEL= 0
 % Adjoint GMRES settings.
 MULTIZONE= YES
 NEWTON_KRYLOV= YES
-% These 3 numbers should usually be the same.
-INNER_ITER= 60
-QUASI_NEWTON_NUM_SAMPLES= 60
-LINEAR_SOLVER_RESTART_FREQUENCY= 60
+QUASI_NEWTON_NUM_SAMPLES= 999
+% Acts as the restart frequency.
+INNER_ITER= 30
 % This should be ~1/4 of the restart frequency. After the first outer
 % iteration, the solver does "frequency - deflation" inner iterations.
-LINEAR_SOLVER_RESTART_DEFLATION= 15
+LINEAR_SOLVER_RESTART_DEFLATION= 5
 %
 % ------------------------ CONVERGENCE CRITERIA ------------------------- %
 %

From 2c55e8e01fecb733435005193bcab2c0708e29fa Mon Sep 17 00:00:00 2001
From: Pedro Gomes <pcarruscag@gmail.com>
Date: Sat, 11 Apr 2026 11:50:50 +0100
Subject: [PATCH 7/7] update

---
 TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg | 2 +-
 TestCases/parallel_regression_AD.py           | 6 ++++--
 TestCases/vandv.py                            | 5 +++--
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg b/TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg
index a59a5b0ce8f..edcd5c2dc31 100644
--- a/TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg
+++ b/TestCases/disc_adj_fsi/dyn_fsi/configFlow.cfg
@@ -62,7 +62,7 @@ LINEAR_SOLVER_SMOOTHER_RELAXATION= 0.7
 MGLEVEL= 0
 NEWTON_KRYLOV= YES
 QUASI_NEWTON_NUM_SAMPLES= 999
-LINEAR_SOLVER_RESTART_DEFLATION= 0
+LINEAR_SOLVER_RESTART_DEFLATION= 5
 %
 DEFORM_LINEAR_SOLVER= CONJUGATE_GRADIENT
 DEFORM_LINEAR_SOLVER_PREC= ILU
diff --git a/TestCases/parallel_regression_AD.py b/TestCases/parallel_regression_AD.py
index 9e4863cc42b..2c2369c6e9e 100644
--- a/TestCases/parallel_regression_AD.py
+++ b/TestCases/parallel_regression_AD.py
@@ -286,8 +286,8 @@ def main():
     discadj_fsi2.cfg_dir   = "disc_adj_fsi/Airfoil_2d"
     discadj_fsi2.cfg_file  = "config.cfg"
     discadj_fsi2.test_iter = 8
-    discadj_fsi2.test_vals         = [-4.773024, 0.915849, -3.863369, 0.295450, 3.839800]
-    discadj_fsi2.test_vals_aarch64 = [-4.772641, 0.917601, -3.863369, 0.295450, 3.839800]
+    discadj_fsi2.test_vals         = [-3.824870, 1.979160, -3.863368, 0.295450, 3.839800]
+    discadj_fsi2.test_vals_aarch64 = [-3.824870, 1.979160, -3.863368, 0.295450, 3.839800]
     discadj_fsi2.tol       = 0.00001
     test_list.append(discadj_fsi2)
 
@@ -559,6 +559,8 @@ def main():
     dyn_discadj_fsi.reference_file = "grad_dv.opt.ref"
     dyn_discadj_fsi.reference_file_aarch64 = "grad_dv_aarch64.opt.ref"
     dyn_discadj_fsi.test_file = "grad_young.opt"
+    dyn_discadj_fsi.comp_threshold = 1e-6
+    dyn_discadj_fsi.tol_file_percent = 0.1
     dyn_discadj_fsi.unsteady  = True
     pass_list.append(dyn_discadj_fsi.run_filediff())
     test_list.append(dyn_discadj_fsi)
diff --git a/TestCases/vandv.py b/TestCases/vandv.py
index 4c54ff4228b..ea6855b661d 100644
--- a/TestCases/vandv.py
+++ b/TestCases/vandv.py
@@ -48,12 +48,13 @@ def main():
     p30n30.test_vals = [-11.267106, -11.168215, -11.182822, -10.949673, -14.233489, 0.052235, 2.830394, 1.318894, -1.210648, 1, 1.2763e+01]
     test_list.append(p30n30)
 
+    # This is not part of the V&V cases yet; it is tested in this script because it is a relatively long test (~1 min).
     shutil.copy("vandv/rans/30p30n/solution.dat", "vandv/rans/30p30n/solution_0.dat")
     p30n30_ad = TestCase('30P30N_ad')
     p30n30_ad.cfg_dir = "vandv/rans/30p30n"
     p30n30_ad.cfg_file = "config_ad.cfg"
-    p30n30_ad.test_iter = 5
-    p30n30_ad.test_vals = [-8.167332, -8.738471, -8.762033, -8.500107, -7.433292, -0.808628, -2.091805, 1.1791e-01, 3.4123e+01]
+    p30n30_ad.test_iter = 9
+    p30n30_ad.test_vals = [-7.283709, -6.072615, -5.995304, -7.197048, -4.568373, -1.167146, -2.316777, 1.1791e-01, 3.4123e+01]
     p30n30_ad.command = TestCase.Command("mpirun -n 2", "SU2_CFD_AD")
     test_list.append(p30n30_ad)