From bd66fccee4bddfb9334fd419444665abeaea6034 Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Tue, 12 Apr 2022 15:03:47 +0200
Subject: [PATCH 01/11] add `MP_SMALL_STACK_SIZE` option

This adds an option to use a heap-buffer for the usually stack-based
`MP_WARRAY`-sized temporary buffers.

By default a single buffer is reserved. The number of buffers can be set
* at compile-time via the `MP_WARRAY_NUM` define
* at run-time by calling `mp_warray_init()`

The internal structure can only be created once. If one wants to modify
the maximum number of elements, the entire structure has to be freed
by calling `mp_warray_free()` before it is initialized again.

In case one wants to use this option with multiple threads, one shall
use the `mp_warray_init()` function and pass appropriate locking functions.

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 .github/workflows/main.yml     |  6 ++++
 CMakeLists.txt                 |  2 +-
 demo/test.c                    | 31 ++++++++++++++--
 doc/bn.tex                     | 64 +++++++++++++++++++++++++++++++++-
 helper.pl                      |  2 +-
 mp_warray_free.c               | 36 +++++++++++++++++++
 mp_warray_init.c               | 55 +++++++++++++++++++++++++++++
 s_mp_montgomery_reduce_comba.c |  7 +++-
 s_mp_mul_comba.c               |  7 +++-
 s_mp_mul_high_comba.c          |  7 +++-
 s_mp_sqr_comba.c               |  6 +++-
 s_mp_warray.c                  |  8 +++++
 s_mp_warray_free.c             | 17 +++++++++
 s_mp_warray_get.c              | 33 ++++++++++++++++++
 s_mp_warray_put.c              | 20 +++++++++++
 tommath.def                    |  2 ++
 tommath.h                      | 28 +++++++++++++++
 tommath_class.h                | 36 +++++++++++++++++++
 tommath_private.h              | 36 +++++++++++++++++++
 tommath_superclass.h           |  2 ++
 20 files changed, 396 insertions(+), 9 deletions(-)
 create mode 100644 mp_warray_free.c
 create mode 100644 mp_warray_init.c
 create mode 100644 s_mp_warray.c
 create mode 100644 s_mp_warray_free.c
 create mode 100644 s_mp_warray_get.c
 create mode 100644 s_mp_warray_put.c

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 24f881f3d..6fcd5d466 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -70,6 +70,12 @@ jobs:
           # RSA superclass with tests (no sanitizer, but debug info)
           - { BUILDOPTIONS: '--with-cc=gcc --with-m64 --cflags=-DLTM_NOTHING --cflags=-DSC_RSA_1_WITH_TESTS --limit-valgrind',   SANITIZER: '',  COMPILE_DEBUG: '1', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: '' }
 
+          # Build with small stack-size
+          - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                                SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_NO_LOCKING',       SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                           SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_TEST_LOCKING', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',       OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
+
           # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs.
           #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune'
           #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_32BIT --limit-valgrind --make-option=tune'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d60632777..2f59d32e8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,7 +138,7 @@ if(COMPILE_LTO)
     if(COMPILER_SUPPORTS_LTO)
         set_property(TARGET ${PROJECT_NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
     else()
-        message(SEND_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.")
+        message(FATAL_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.")
     endif()
 endif()
 
diff --git a/demo/test.c b/demo/test.c
index f290dbf21..4b1d30f86 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -2451,6 +2451,21 @@ static int test_mp_pack_unpack(void)
    return EXIT_FAILURE;
 }
 
+
+#ifdef MP_TEST_LOCKING
+#ifdef MP_NO_LOCKING
+#error "Can't test locking when locking is disabled"
+#endif
+static mp_lock lock_ctx;
+static int noop_lock_unlock(void *ctx)
+{
+   EXPECT(ctx == &lock_ctx);
+   return 0;
+LBL_ERR:
+   return -1;
+}
+#endif
+
 #ifndef LTM_TEST_DYNAMIC
 #define ONLY_PUBLIC_API_C
 #endif
@@ -2525,7 +2540,14 @@ static int unit_tests(int argc, char **argv)
    unsigned long i, ok, fail, nop;
    uint64_t t;
    int j;
+#ifdef MP_TEST_LOCKING
+   lock_ctx.lock = noop_lock_unlock;
+   lock_ctx.unlock = noop_lock_unlock;
+   lock_ctx.ctx = &lock_ctx;
 
+   if (mp_warray_init(MP_WARRAY_NUM, true, &lock_ctx) != MP_OKAY)
+      return EXIT_FAILURE;
+#endif
    ok = fail = nop = 0;
 
    t = (uint64_t)time(NULL);
@@ -2533,6 +2555,7 @@ static int unit_tests(int argc, char **argv)
    s_mp_rand_jenkins_init(t);
    mp_rand_source(s_mp_rand_jenkins);
 
+
    for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
       if (argc > 1) {
          for (j = 1; j < argc; ++j) {
@@ -2556,8 +2579,12 @@ static int unit_tests(int argc, char **argv)
    }
    fprintf(fail?stderr:stdout, "Tests OK/NOP/FAIL: %lu/%lu/%lu\n", ok, nop, fail);
 
-   if (fail != 0) return EXIT_FAILURE;
-   else return EXIT_SUCCESS;
+   EXPECT(mp_warray_free() != -2);
+
+   if (fail == 0)
+      return EXIT_SUCCESS;
+LBL_ERR:
+   return EXIT_FAILURE;
 }
 
 int main(int argc, char **argv)
diff --git a/doc/bn.tex b/doc/bn.tex
index 22ae5f3eb..185876335 100644
--- a/doc/bn.tex
+++ b/doc/bn.tex
@@ -352,6 +352,20 @@ \subsubsection{Operand Size Related}
   \end{center}
 \end{small}
 
+\subsection{Small-Stack option}
+\label{ch:SMALL_STACK_INTRO}
+The library can be compiled with the symbol \texttt{MP\_SMALL\_STACK\_SIZE} defined, which results in
+the temporary \texttt{MP\_WARRAY}-sized stack buffers being put on the heap.
+This comes with one problem, namely: formerly promised thread-safety isn't given anymore.
+Therefore if the Small-Stack option is enabled while doing multi threading, the provided locking
+mechanism shall be used.
+For some use cases it can be desired to use the Small-Stack option, but there are no threads and
+therefore we provide the possibility to disable locking by defining the symbol \texttt{MP\_NO\_LOCKING}.
+
+In case one already knows how many threads must be supported, the symbol \texttt{MP\_WARRAY\_NUM} can
+be useful. It can be pre-defined at compile time to the number of heap buffers created on automatic
+initialisation. C.f. \ref{ch:SMALL_STACK_API} for the dynamic API and further details.
+
 \section{Purpose of LibTomMath}
 Unlike	GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath
 was not written with bleeding edge performance in mind.  First and foremost LibTomMath was written
@@ -428,7 +442,9 @@ \chapter{Getting Started with LibTomMath}
 \section{Building Programs}
 In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library
 file (typically
-libtommath.a).	There is no library initialization required and the entire library is thread safe.
+libtommath.a).	There is no library initialization required and the entire library is thread safe
+if it is used in its default configuration. Locking is recommended if the small-stack option
+is enabled and multiple threads are used, c.f. \ref{ch:SMALL_STACK_INTRO} resp. \ref{ch:SMALL_STACK_API}
 
 \section{Return Codes}
 There are five possible return codes a function may return.
@@ -813,6 +829,52 @@ \subsection{Adding additional digits}
 \end{alltt}
 \end{small}
 
+\section{Small-Stack option}
+\label{ch:SMALL_STACK_API}
+
+In case the \texttt{MP\_SMALL\_STACK\_SIZE} symbol is defined the following functions
+can be useful.
+
+To initialize the internal structure the following function shall be called.
+
+\index{mp\_warray\_init}
+\begin{alltt}
+mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock);
+\end{alltt}
+
+The flag \texttt{preallocate} controls whether the internal buffers --
+\texttt{n\_alloc} buffers of size \texttt{MP\_WARRAY} -- will be allocated when
+\texttt{mp\_warray\_init()} is called, or whether they will be allocated when required.
+The \texttt{mp\_lock} struct looks as follows and shall be used to protect the
+internal structure when using the library in a multi-threaded application.
+
+\index{mp\_lock}
+\begin{alltt}
+typedef struct {
+   int (*lock)(void *ctx);
+   int (*unlock)(void *ctx);
+   void *ctx;
+} mp_lock;
+\end{alltt}
+
+The \texttt{mp\_lock.lock} resp. \texttt{mp\_lock.unlock} functions will be called before resp.
+after modifying the internal struct.
+The \texttt{mp\_lock.ctx} element will be passed to those functions.
+
+To free the internally allocated memory the following function shall be called.
+
+\index{mp\_warray\_free}
+\begin{alltt}
+int mp_warray_free(void);
+\end{alltt}
+
+
+Those two API functions are always available, even if the \texttt{MP\_SMALL\_STACK\_SIZE} option
+has been disabled at compile time.
+In that case \texttt{mp\_warray\_init()} will return \texttt{MP\_ERR} and \texttt{mp\_warray\_free()}
+will return $-1$.
+
+
 \chapter{Basic Operations}
 \section{Copying}
 
diff --git a/helper.pl b/helper.pl
index 53658614c..ffc592a7c 100755
--- a/helper.pl
+++ b/helper.pl
@@ -394,7 +394,7 @@ sub update_dep
     foreach my $filename (glob '*mp_*.c') {
         my $content;
         my $cc = $ENV{'CC'} || 'gcc';
-        $content = `$cc -E -x c -DLTM_ALL $filename`;
+        $content = `$cc -E -x c -DLTM_ALL -DMP_SMALL_STACK_SIZE $filename`;
         $content =~ s/^# 1 "$filename".*?^# 2 "$filename"//ms;
 
         # convert filename to upper case so we can use it as a define
diff --git a/mp_warray_free.c b/mp_warray_free.c
new file mode 100644
index 000000000..4b01282a0
--- /dev/null
+++ b/mp_warray_free.c
@@ -0,0 +1,36 @@
+#include "tommath_private.h"
+#ifdef MP_WARRAY_FREE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+/* static check that the multiplication won't overflow */
+MP_STATIC_ASSERT(warray_free_sz_does_not_overflow, (sizeof(mp_word) * MP_WARRAY) >= MP_WARRAY)
+
+static int s_warray_free(void)
+{
+   int ret = 0;
+   size_t n;
+   S_MP_WARRAY_LOCK();
+   for (n = 0; n < s_mp_warray.allocated; ++n) {
+      if (s_mp_warray.l_used[n].warray) {
+         ret = -2;
+         goto ERR_OUT;
+      }
+   }
+   for (n = 0; n < s_mp_warray.allocated; ++n) {
+      MP_FREE(s_mp_warray.l_free[n].warray, sizeof(mp_word) * MP_WARRAY);
+      s_mp_warray.l_free[n].warray = NULL;
+   }
+   s_mp_warray_free(s_mp_warray.usable);
+ERR_OUT:
+   S_MP_WARRAY_UNLOCK();
+   return ret;
+}
+
+int mp_warray_free(void)
+{
+   if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_free();
+   return -1;
+}
+
+#endif
diff --git a/mp_warray_init.c b/mp_warray_init.c
new file mode 100644
index 000000000..0ff93aa53
--- /dev/null
+++ b/mp_warray_init.c
@@ -0,0 +1,55 @@
+#include "tommath_private.h"
+#ifdef MP_WARRAY_INIT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+static mp_err s_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
+{
+   size_t n;
+   if (s_mp_warray.l_free != NULL || s_mp_warray.l_used != NULL) {
+      return MP_VAL;
+   }
+
+   if (MP_HAS(MP_USE_LOCKING) && (lock != NULL)) {
+      if (lock->lock == NULL || lock->unlock == NULL)
+         return MP_VAL;
+      s_mp_warray.lock = *lock;
+      s_mp_warray.locking_enabled = true;
+   } else {
+      s_mp_zero_buf(&s_mp_warray.lock, sizeof(s_mp_warray.lock));
+   }
+
+   s_mp_warray.l_free = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_free)));
+   s_mp_warray.l_used = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_used)));
+   if (s_mp_warray.l_free == NULL || s_mp_warray.l_used == NULL) {
+      s_mp_warray_free(n_alloc);
+      return MP_MEM;
+   }
+
+   if (preallocate) {
+      for (n = 0; n < n_alloc; ++n) {
+         s_mp_warray.l_free[n].warray = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
+         if (s_mp_warray.l_free[n].warray == NULL) {
+            while (n > 0) {
+               n--;
+               MP_FREE(s_mp_warray.l_free[n].warray, MP_WARRAY * sizeof(mp_word));
+               s_mp_warray.l_free[n].warray = NULL;
+            }
+            s_mp_warray_free(n_alloc);
+            return MP_MEM;
+         }
+      }
+      s_mp_warray.allocated = n_alloc;
+   }
+
+   s_mp_warray.usable = n_alloc;
+   return MP_OKAY;
+}
+
+mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
+{
+   if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate, lock);
+   return MP_ERR;
+}
+
+#endif
diff --git a/s_mp_montgomery_reduce_comba.c b/s_mp_montgomery_reduce_comba.c
index 7472caf34..3858f75a0 100644
--- a/s_mp_montgomery_reduce_comba.c
+++ b/s_mp_montgomery_reduce_comba.c
@@ -15,9 +15,12 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
 {
    int     ix, oldused;
    mp_err  err;
-   mp_word W[MP_WARRAY];
+   mp_word MP_ALLOC_WARRAY(W);
+
+   MP_CHECK_WARRAY(W);
 
    if (x->used > MP_WARRAY) {
+      MP_FREE_WARRAY(W);
       return MP_VAL;
    }
 
@@ -26,6 +29,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
 
    /* grow a as required */
    if ((err = mp_grow(x, n->used + 1)) != MP_OKAY) {
+      MP_FREE_WARRAY(W);
       return err;
    }
 
@@ -110,6 +114,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
 
    mp_clamp(x);
 
+   MP_FREE_WARRAY(W);
    /* if A >= m then A = A - m */
    if (mp_cmp_mag(x, n) != MP_LT) {
       return s_mp_sub(x, n, x);
diff --git a/s_mp_mul_comba.c b/s_mp_mul_comba.c
index ca89ff9dd..5b37035ea 100644
--- a/s_mp_mul_comba.c
+++ b/s_mp_mul_comba.c
@@ -23,15 +23,19 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
 {
    int      oldused, pa, ix;
    mp_err   err;
-   mp_digit W[MP_WARRAY];
+   mp_digit MP_ALLOC_WARRAY(W);
    mp_word  _W;
 
+   MP_CHECK_WARRAY(W);
+
    if (digs < 0) {
+      MP_FREE_WARRAY(W);
       return MP_VAL;
    }
 
    /* grow the destination as required */
    if ((err = mp_grow(c, digs)) != MP_OKAY) {
+      MP_FREE_WARRAY(W);
       return err;
    }
 
@@ -77,6 +81,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
    s_mp_zero_digs(c->dp + c->used, oldused - c->used);
 
    mp_clamp(c);
+   MP_FREE_WARRAY(W);
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_mul_high_comba.c b/s_mp_mul_high_comba.c
index b5ac06d74..b0096d4e6 100644
--- a/s_mp_mul_high_comba.c
+++ b/s_mp_mul_high_comba.c
@@ -16,16 +16,20 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs
 {
    int     oldused, pa, ix;
    mp_err   err;
-   mp_digit W[MP_WARRAY];
+   mp_digit MP_ALLOC_WARRAY(W);
    mp_word  _W;
 
+   MP_CHECK_WARRAY(W);
+
    if (digs < 0) {
+      MP_FREE_WARRAY(W);
       return MP_VAL;
    }
 
    /* grow the destination as required */
    pa = a->used + b->used;
    if ((err = mp_grow(c, pa)) != MP_OKAY) {
+      MP_FREE_WARRAY(W);
       return err;
    }
 
@@ -69,6 +73,7 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs
    s_mp_zero_digs(c->dp + c->used, oldused - c->used);
 
    mp_clamp(c);
+   MP_FREE_WARRAY(W);
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_sqr_comba.c b/s_mp_sqr_comba.c
index 1bcc1f93f..336a0a082 100644
--- a/s_mp_sqr_comba.c
+++ b/s_mp_sqr_comba.c
@@ -16,13 +16,16 @@ After that loop you do the squares and add them in.
 mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b)
 {
    int       oldused, pa, ix;
-   mp_digit  W[MP_WARRAY];
+   mp_digit  MP_ALLOC_WARRAY(W);
    mp_word   W1;
    mp_err err;
 
+   MP_CHECK_WARRAY(W);
+
    /* grow the destination as required */
    pa = a->used + a->used;
    if ((err = mp_grow(b, pa)) != MP_OKAY) {
+      MP_FREE_WARRAY(W);
       return err;
    }
 
@@ -82,6 +85,7 @@ mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b)
    s_mp_zero_digs(b->dp + b->used, oldused - b->used);
 
    mp_clamp(b);
+   MP_FREE_WARRAY(W);
    return MP_OKAY;
 }
 #endif
diff --git a/s_mp_warray.c b/s_mp_warray.c
new file mode 100644
index 000000000..d181057cb
--- /dev/null
+++ b/s_mp_warray.c
@@ -0,0 +1,8 @@
+#include "tommath_private.h"
+#ifdef S_MP_WARRAY_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+st_warray s_mp_warray;
+
+#endif
diff --git a/s_mp_warray_free.c b/s_mp_warray_free.c
new file mode 100644
index 000000000..9d8b75eb1
--- /dev/null
+++ b/s_mp_warray_free.c
@@ -0,0 +1,17 @@
+#include "tommath_private.h"
+#ifdef S_MP_WARRAY_FREE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+void s_mp_warray_free(size_t n)
+{
+   (void)n;
+   MP_FREE(s_mp_warray.l_free, n * sizeof(*(s_mp_warray.l_free)));
+   MP_FREE(s_mp_warray.l_used, n * sizeof(*(s_mp_warray.l_used)));
+   s_mp_warray.l_free = NULL;
+   s_mp_warray.l_used = NULL;
+   s_mp_warray.allocated = 0;
+   s_mp_warray.usable = 0;
+}
+
+#endif
diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c
new file mode 100644
index 000000000..69b2b72dd
--- /dev/null
+++ b/s_mp_warray_get.c
@@ -0,0 +1,33 @@
+#include "tommath_private.h"
+#ifdef S_MP_WARRAY_GET_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+void *s_mp_warray_get(void)
+{
+   void *ret = NULL;
+   size_t n;
+   S_MP_WARRAY_LOCK();
+   if (s_mp_warray.usable == 0) {
+      if (mp_warray_init(MP_WARRAY_NUM, false, NULL) != MP_OKAY)
+         return NULL;
+   }
+   for (n = 0; n < s_mp_warray.allocated; ++n) {
+      if (s_mp_warray.l_free[n].warray) {
+         s_mp_warray.l_used[n] = s_mp_warray.l_free[n];
+         s_mp_warray.l_free[n].warray = NULL;
+         ret = s_mp_warray.l_used[n].warray;
+         goto LBL_OUT;
+      }
+   }
+   if (s_mp_warray.allocated + 1 > s_mp_warray.usable)
+      goto LBL_OUT;
+   ret = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
+   s_mp_warray.l_used[s_mp_warray.allocated++].warray = ret;
+
+LBL_OUT:
+   S_MP_WARRAY_UNLOCK();
+   return ret;
+}
+
+#endif
diff --git a/s_mp_warray_put.c b/s_mp_warray_put.c
new file mode 100644
index 000000000..5d84bea86
--- /dev/null
+++ b/s_mp_warray_put.c
@@ -0,0 +1,20 @@
+#include "tommath_private.h"
+#ifdef S_MP_WARRAY_PUT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+void s_mp_warray_put(void *w)
+{
+   size_t n;
+   S_MP_WARRAY_LOCK();
+   for (n = 0; n < s_mp_warray.allocated; ++n) {
+      if (s_mp_warray.l_used[n].warray == w) {
+         s_mp_warray.l_free[n] = s_mp_warray.l_used[n];
+         s_mp_warray.l_used[n].warray = NULL;
+         break;
+      }
+   }
+   S_MP_WARRAY_UNLOCK();
+}
+
+#endif
diff --git a/tommath.def b/tommath.def
index 86f348727..7aa5860f7 100644
--- a/tommath.def
+++ b/tommath.def
@@ -125,6 +125,8 @@ EXPORTS
     mp_to_ubin
     mp_ubin_size
     mp_unpack
+    mp_warray_free
+    mp_warray_init
     mp_xor
     mp_zero
     MP_MUL_KARATSUBA_CUTOFF
diff --git a/tommath.h b/tommath.h
index 84bb0909d..d36753dec 100644
--- a/tommath.h
+++ b/tommath.h
@@ -78,6 +78,25 @@ typedef uint32_t             mp_digit;
 #define MP_MASK          ((((mp_digit)1)<<((mp_digit)MP_DIGIT_BIT))-((mp_digit)1))
 #define MP_DIGIT_MAX     MP_MASK
 
+/* In case the stack size has to be limited, use a WARRAY from the heap */
+#ifdef MP_SMALL_STACK_SIZE
+/* Per default we enable the locking mechanism.
+ * Please disable by defining `MP_NO_LOCKING` if you really know what you do.
+ */
+#ifndef MP_NO_LOCKING
+#define MP_USE_LOCKING
+#endif
+#endif /* MP_SMALL_STACK_SIZE */
+
+/* The user can define how many WARRAY instances are allocated,
+ * usually this should equal the number of parallel threads that
+ * use LTM functionality.
+ * This has no effect if `MP_SMALL_STACK_SIZE` is not defined.
+ */
+#ifndef MP_WARRAY_NUM
+#define MP_WARRAY_NUM 1
+#endif
+
 /* Primality generation flags */
 #define MP_PRIME_BBS      0x0001 /* BBS style prime */
 #define MP_PRIME_SAFE     0x0002 /* Safe prime (p-1)/2 == prime */
@@ -588,6 +607,15 @@ mp_err mp_fread(mp_int *a, int radix, FILE *stream) MP_WUR;
 mp_err mp_fwrite(const mp_int *a, int radix, FILE *stream) MP_WUR;
 #endif
 
+typedef struct {
+   int (*lock)(void *ctx);
+   int (*unlock)(void *ctx);
+   void *ctx;
+} mp_lock;
+
+mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock);
+int mp_warray_free(void);
+
 #define mp_to_binary(M, S, N)  mp_to_radix((M), (S), (N), NULL, 2)
 #define mp_to_octal(M, S, N)   mp_to_radix((M), (S), (N), NULL, 8)
 #define mp_to_decimal(M, S, N) mp_to_radix((M), (S), (N), NULL, 10)
diff --git a/tommath_class.h b/tommath_class.h
index e08bc5f3c..e1a254fed 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -131,6 +131,8 @@
 #   define MP_TO_UBIN_C
 #   define MP_UBIN_SIZE_C
 #   define MP_UNPACK_C
+#   define MP_WARRAY_FREE_C
+#   define MP_WARRAY_INIT_C
 #   define MP_XOR_C
 #   define MP_ZERO_C
 #   define S_MP_ADD_C
@@ -165,6 +167,10 @@
 #   define S_MP_SQR_KARATSUBA_C
 #   define S_MP_SQR_TOOM_C
 #   define S_MP_SUB_C
+#   define S_MP_WARRAY_C
+#   define S_MP_WARRAY_FREE_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_BUF_C
 #   define S_MP_ZERO_DIGS_C
 #endif
@@ -957,6 +963,15 @@
 #   define MP_ZERO_C
 #endif
 
+#if defined(MP_WARRAY_FREE_C)
+#   define S_MP_WARRAY_FREE_C
+#endif
+
+#if defined(MP_WARRAY_INIT_C)
+#   define S_MP_WARRAY_FREE_C
+#   define S_MP_ZERO_BUF_C
+#endif
+
 #if defined(MP_XOR_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
@@ -1137,6 +1152,8 @@
 #   define MP_CMP_MAG_C
 #   define MP_GROW_C
 #   define S_MP_SUB_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_BUF_C
 #   define S_MP_ZERO_DIGS_C
 #endif
@@ -1165,6 +1182,8 @@
 #if defined(S_MP_MUL_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_DIGS_C
 #endif
 
@@ -1179,6 +1198,8 @@
 #if defined(S_MP_MUL_HIGH_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_DIGS_C
 #endif
 
@@ -1244,6 +1265,8 @@
 #if defined(S_MP_SQR_COMBA_C)
 #   define MP_CLAMP_C
 #   define MP_GROW_C
+#   define S_MP_WARRAY_GET_C
+#   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_DIGS_C
 #endif
 
@@ -1279,6 +1302,19 @@
 #   define S_MP_ZERO_DIGS_C
 #endif
 
+#if defined(S_MP_WARRAY_C)
+#endif
+
+#if defined(S_MP_WARRAY_FREE_C)
+#endif
+
+#if defined(S_MP_WARRAY_GET_C)
+#   define MP_WARRAY_INIT_C
+#endif
+
+#if defined(S_MP_WARRAY_PUT_C)
+#endif
+
 #if defined(S_MP_ZERO_BUF_C)
 #endif
 
diff --git a/tommath_private.h b/tommath_private.h
index c1fa95a04..6ccf8f0dd 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -234,6 +234,42 @@ MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix,
 MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 
+#ifdef MP_SMALL_STACK_SIZE
+#define MP_SMALL_STACK_SIZE_C
+#define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get()
+#define MP_FREE_WARRAY(name) s_mp_warray_put(name)
+#define MP_CHECK_WARRAY(name) do { if ((name) == NULL) { return MP_MEM; } } while(0)
+#else
+#define MP_ALLOC_WARRAY(name) name[MP_WARRAY]
+#define MP_FREE_WARRAY(name)
+#define MP_CHECK_WARRAY(name)
+#endif
+
+#ifdef MP_USE_LOCKING
+#define MP_USE_LOCKING_C
+#define S_MP_WARRAY_LOCK() do { if (s_mp_warray.locking_enabled) { s_mp_warray.lock.lock(s_mp_warray.lock.ctx); } } while(0)
+#define S_MP_WARRAY_UNLOCK() do { if (s_mp_warray.locking_enabled) { s_mp_warray.lock.unlock(s_mp_warray.lock.ctx); } } while(0)
+#else
+#define S_MP_WARRAY_LOCK()
+#define S_MP_WARRAY_UNLOCK()
+#endif
+
+struct warray {
+   void *warray;
+};
+typedef struct {
+   struct warray *l_free, *l_used;
+   size_t allocated, usable;
+   bool locking_enabled;
+   mp_lock lock;
+} st_warray;
+
+extern MP_PRIVATE st_warray s_mp_warray;
+
+MP_PRIVATE void *s_mp_warray_get(void);
+MP_PRIVATE void s_mp_warray_put(void *w);
+MP_PRIVATE void s_mp_warray_free(size_t n);
+
 #define MP_RADIX_MAP_REVERSE_SIZE 80u
 extern MP_PRIVATE const char s_mp_radix_map[];
 extern MP_PRIVATE const uint8_t s_mp_radix_map_reverse[];
diff --git a/tommath_superclass.h b/tommath_superclass.h
index 9245e0020..10c7f12a2 100644
--- a/tommath_superclass.h
+++ b/tommath_superclass.h
@@ -42,6 +42,8 @@
 #   define MP_SBIN_SIZE_C
 #   define MP_TO_RADIX_C
 #   define MP_TO_SBIN_C
+#   define MP_WARRAY_FREE_C
+#   define MP_WARRAY_INIT_C
 #   define S_MP_RAND_JENKINS_C
 #   define S_MP_RAND_PLATFORM_C
 #endif

From 18e67e1888a801d550b3a9e3e21943fca0d45875 Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Thu, 14 Mar 2024 11:42:34 +0100
Subject: [PATCH 02/11] Replace locking by atomic operations

`s_warray_init()` and `s_warray_free()` are not thread-safe and MUST NOT be
called concurrently from multiple threads.

This also removes `MP_WARRAY_NUM`, since automatic initialization will not
be safe for more than one thread.

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 .github/workflows/main.yml |  2 --
 demo/test.c                | 23 -----------------------
 doc/bn.tex                 | 33 ++++++++-------------------------
 mp_warray_free.c           |  2 --
 mp_warray_init.c           | 15 +++------------
 s_mp_warray_get.c          | 17 +++++++++--------
 s_mp_warray_put.c          |  8 +++-----
 tommath.h                  | 27 +--------------------------
 tommath_class.h            |  1 -
 tommath_private.h          | 15 ++++-----------
 10 files changed, 28 insertions(+), 115 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6fcd5d466..664cfcaf7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -72,9 +72,7 @@ jobs:
 
           # Build with small stack-size
           - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                                SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'gcc-multilib' }
-          - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_NO_LOCKING',       SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'gcc-multilib' }
           - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                           SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
-          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_TEST_LOCKING', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',       OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
 
           # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs.
           #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune'
diff --git a/demo/test.c b/demo/test.c
index 4b1d30f86..c4eb1f8a8 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -2451,21 +2451,6 @@ static int test_mp_pack_unpack(void)
    return EXIT_FAILURE;
 }
 
-
-#ifdef MP_TEST_LOCKING
-#ifdef MP_NO_LOCKING
-#error "Can't test locking when locking is disabled"
-#endif
-static mp_lock lock_ctx;
-static int noop_lock_unlock(void *ctx)
-{
-   EXPECT(ctx == &lock_ctx);
-   return 0;
-LBL_ERR:
-   return -1;
-}
-#endif
-
 #ifndef LTM_TEST_DYNAMIC
 #define ONLY_PUBLIC_API_C
 #endif
@@ -2540,14 +2525,6 @@ static int unit_tests(int argc, char **argv)
    unsigned long i, ok, fail, nop;
    uint64_t t;
    int j;
-#ifdef MP_TEST_LOCKING
-   lock_ctx.lock = noop_lock_unlock;
-   lock_ctx.unlock = noop_lock_unlock;
-   lock_ctx.ctx = &lock_ctx;
-
-   if (mp_warray_init(MP_WARRAY_NUM, true, &lock_ctx) != MP_OKAY)
-      return EXIT_FAILURE;
-#endif
    ok = fail = nop = 0;
 
    t = (uint64_t)time(NULL);
diff --git a/doc/bn.tex b/doc/bn.tex
index 185876335..63e71633b 100644
--- a/doc/bn.tex
+++ b/doc/bn.tex
@@ -357,14 +357,10 @@ \subsection{Small-Stack option}
 The library can be compiled with the symbol \texttt{MP\_SMALL\_STACK\_SIZE} defined, which results in
 the temporary \texttt{MP\_WARRAY}-sized stack buffers being put on the heap.
 This comes with one problem, namely: formerly promised thread-safety isn't given anymore.
-Therefore if the Small-Stack option is enabled while doing multi threading, the provided locking
-mechanism shall be used.
-For some use cases it can be desired to use the Small-Stack option, but there are no threads and
-therefore we provide the possibility to disable locking by defining the symbol \texttt{MP\_NO\_LOCKING}.
+Therefore if the Small-Stack option is enabled while doing multi threading, one shall always initialize
+the library by calling \texttt{mp\_warray\_init()} once with the correct number of threads.
 
-In case one already knows how many threads must be supported, the symbol \texttt{MP\_WARRAY\_NUM} can
-be useful. It can be pre-defined at compile time to the number of heap buffers created on automatic
-initialisation. C.f. \ref{ch:SMALL_STACK_API} for the dynamic API and further details.
+C.f. \ref{ch:SMALL_STACK_API} for the API description and further details.
 
 \section{Purpose of LibTomMath}
 Unlike	GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath
@@ -443,8 +439,10 @@ \section{Building Programs}
 In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library
 file (typically
 libtommath.a).	There is no library initialization required and the entire library is thread safe
-if it is used in its default configuration. Locking is recommended if the small-stack option
-is enabled and multiple threads are used, c.f. \ref{ch:SMALL_STACK_INTRO} resp. \ref{ch:SMALL_STACK_API}
+if it is used in its default configuration. The small-stack option makes use of atomic operations
+to maintain its internal state and therefore does not require locking, but it MUST be initialized
+if used from multiple threads. For further information see \ref{ch:SMALL_STACK_INTRO} resp.
+\ref{ch:SMALL_STACK_API}.
 
 \section{Return Codes}
 There are five possible return codes a function may return.
@@ -839,27 +837,12 @@ \section{Small-Stack option}
 
 \index{mp\_warray\_init}
 \begin{alltt}
-mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock);
+mp_err mp_warray_init(size_t n_alloc, bool preallocate);
 \end{alltt}
 
 The flag \texttt{preallocate} controls whether the internal buffers --
 \texttt{n\_alloc} buffers of size \texttt{MP\_WARRAY} -- will be allocated when
 \texttt{mp\_warray\_init()} is called, or whether they will be allocated when required.
-The \texttt{mp\_lock} struct looks as follows and shall be used to protect the
-internal structure when using the library in a multi-threaded application.
-
-\index{mp\_lock}
-\begin{alltt}
-typedef struct {
-   int (*lock)(void *ctx);
-   int (*unlock)(void *ctx);
-   void *ctx;
-} mp_lock;
-\end{alltt}
-
-The \texttt{mp\_lock.lock} resp. \texttt{mp\_lock.unlock} functions will be called before resp.
-after modifying the internal struct.
-The \texttt{mp\_lock.ctx} element will be passed to those functions.
 
 To free the internally allocated memory the following function shall be called.
 
diff --git a/mp_warray_free.c b/mp_warray_free.c
index 4b01282a0..088efefc4 100644
--- a/mp_warray_free.c
+++ b/mp_warray_free.c
@@ -10,7 +10,6 @@ static int s_warray_free(void)
 {
    int ret = 0;
    size_t n;
-   S_MP_WARRAY_LOCK();
    for (n = 0; n < s_mp_warray.allocated; ++n) {
       if (s_mp_warray.l_used[n].warray) {
          ret = -2;
@@ -23,7 +22,6 @@ static int s_warray_free(void)
    }
    s_mp_warray_free(s_mp_warray.usable);
 ERR_OUT:
-   S_MP_WARRAY_UNLOCK();
    return ret;
 }
 
diff --git a/mp_warray_init.c b/mp_warray_init.c
index 0ff93aa53..c25098861 100644
--- a/mp_warray_init.c
+++ b/mp_warray_init.c
@@ -3,22 +3,13 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
-static mp_err s_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
+static mp_err s_warray_init(size_t n_alloc, bool preallocate)
 {
    size_t n;
    if (s_mp_warray.l_free != NULL || s_mp_warray.l_used != NULL) {
       return MP_VAL;
    }
 
-   if (MP_HAS(MP_USE_LOCKING) && (lock != NULL)) {
-      if (lock->lock == NULL || lock->unlock == NULL)
-         return MP_VAL;
-      s_mp_warray.lock = *lock;
-      s_mp_warray.locking_enabled = true;
-   } else {
-      s_mp_zero_buf(&s_mp_warray.lock, sizeof(s_mp_warray.lock));
-   }
-
    s_mp_warray.l_free = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_free)));
    s_mp_warray.l_used = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_used)));
    if (s_mp_warray.l_free == NULL || s_mp_warray.l_used == NULL) {
@@ -46,9 +37,9 @@ static mp_err s_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
    return MP_OKAY;
 }
 
-mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
+mp_err mp_warray_init(size_t n_alloc, bool preallocate)
 {
-   if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate, lock);
+   if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate);
    return MP_ERR;
 }
 
diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c
index 69b2b72dd..013e83322 100644
--- a/s_mp_warray_get.c
+++ b/s_mp_warray_get.c
@@ -7,26 +7,27 @@ void *s_mp_warray_get(void)
 {
    void *ret = NULL;
    size_t n;
-   S_MP_WARRAY_LOCK();
    if (s_mp_warray.usable == 0) {
-      if (mp_warray_init(MP_WARRAY_NUM, false, NULL) != MP_OKAY)
+      if (mp_warray_init(1, false) != MP_OKAY)
          return NULL;
    }
    for (n = 0; n < s_mp_warray.allocated; ++n) {
-      if (s_mp_warray.l_free[n].warray) {
-         s_mp_warray.l_used[n] = s_mp_warray.l_free[n];
-         s_mp_warray.l_free[n].warray = NULL;
-         ret = s_mp_warray.l_used[n].warray;
+      if (s_mp_warray.l_free[n].warray == NULL)
+         continue;
+      ret = s_mp_warray.l_free[n].warray;
+      if (MP_CMPEXCH(&s_mp_warray.l_free[n].warray, &ret, NULL)) {
+         s_mp_warray.l_used[n].warray = ret;
          goto LBL_OUT;
       }
    }
+   ret = NULL;
    if (s_mp_warray.allocated + 1 > s_mp_warray.usable)
       goto LBL_OUT;
    ret = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
-   s_mp_warray.l_used[s_mp_warray.allocated++].warray = ret;
+   if (ret != NULL)
+      s_mp_warray.l_used[s_mp_warray.allocated++].warray = ret;
 
 LBL_OUT:
-   S_MP_WARRAY_UNLOCK();
    return ret;
 }
 
diff --git a/s_mp_warray_put.c b/s_mp_warray_put.c
index 5d84bea86..4cf413d62 100644
--- a/s_mp_warray_put.c
+++ b/s_mp_warray_put.c
@@ -5,16 +5,14 @@
 
 void s_mp_warray_put(void *w)
 {
-   size_t n;
-   S_MP_WARRAY_LOCK();
-   for (n = 0; n < s_mp_warray.allocated; ++n) {
+   size_t n, allocated = s_mp_warray.allocated;
+   for (n = 0; n < allocated; ++n) {
       if (s_mp_warray.l_used[n].warray == w) {
-         s_mp_warray.l_free[n] = s_mp_warray.l_used[n];
          s_mp_warray.l_used[n].warray = NULL;
+         s_mp_warray.l_free[n].warray = w;
          break;
       }
    }
-   S_MP_WARRAY_UNLOCK();
 }
 
 #endif
diff --git a/tommath.h b/tommath.h
index d36753dec..7da36d0e9 100644
--- a/tommath.h
+++ b/tommath.h
@@ -78,25 +78,6 @@ typedef uint32_t             mp_digit;
 #define MP_MASK          ((((mp_digit)1)<<((mp_digit)MP_DIGIT_BIT))-((mp_digit)1))
 #define MP_DIGIT_MAX     MP_MASK
 
-/* In case the stack size has to be limited, use a WARRAY from the heap */
-#ifdef MP_SMALL_STACK_SIZE
-/* Per default we enable the locking mechanism.
- * Please disable by defining `MP_NO_LOCKING` if you really know what you do.
- */
-#ifndef MP_NO_LOCKING
-#define MP_USE_LOCKING
-#endif
-#endif /* MP_SMALL_STACK_SIZE */
-
-/* The user can define how many WARRAY instances are allocated,
- * usually this should equal the number of parallel threads that
- * use LTM functionality.
- * This has no effect if `MP_SMALL_STACK_SIZE` is not defined.
- */
-#ifndef MP_WARRAY_NUM
-#define MP_WARRAY_NUM 1
-#endif
-
 /* Primality generation flags */
 #define MP_PRIME_BBS      0x0001 /* BBS style prime */
 #define MP_PRIME_SAFE     0x0002 /* Safe prime (p-1)/2 == prime */
@@ -607,13 +588,7 @@ mp_err mp_fread(mp_int *a, int radix, FILE *stream) MP_WUR;
 mp_err mp_fwrite(const mp_int *a, int radix, FILE *stream) MP_WUR;
 #endif
 
-typedef struct {
-   int (*lock)(void *ctx);
-   int (*unlock)(void *ctx);
-   void *ctx;
-} mp_lock;
-
-mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock);
+mp_err mp_warray_init(size_t n_alloc, bool preallocate);
 int mp_warray_free(void);
 
 #define mp_to_binary(M, S, N)  mp_to_radix((M), (S), (N), NULL, 2)
diff --git a/tommath_class.h b/tommath_class.h
index e1a254fed..8841413cf 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -969,7 +969,6 @@
 
 #if defined(MP_WARRAY_INIT_C)
 #   define S_MP_WARRAY_FREE_C
-#   define S_MP_ZERO_BUF_C
 #endif
 
 #if defined(MP_XOR_C)
diff --git a/tommath_private.h b/tommath_private.h
index 6ccf8f0dd..46c3afe2e 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -104,6 +104,10 @@ extern void *MP_CALLOC(size_t nmemb, size_t size);
 extern void MP_FREE(void *mem, size_t size);
 #endif
 
+#ifndef MP_CMPEXCH
+#define MP_CMPEXCH(ptr, expected, desired) __atomic_compare_exchange_n(ptr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)
+#endif
+
 /* feature detection macro */
 #ifdef _MSC_VER
 /* Prevent false positive: not enough arguments for function-like macro invocation */
@@ -245,23 +249,12 @@ MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 #define MP_CHECK_WARRAY(name)
 #endif
 
-#ifdef MP_USE_LOCKING
-#define MP_USE_LOCKING_C
-#define S_MP_WARRAY_LOCK() do { if (s_mp_warray.locking_enabled) { s_mp_warray.lock.lock(s_mp_warray.lock.ctx); } } while(0)
-#define S_MP_WARRAY_UNLOCK() do { if (s_mp_warray.locking_enabled) { s_mp_warray.lock.unlock(s_mp_warray.lock.ctx); } } while(0)
-#else
-#define S_MP_WARRAY_LOCK()
-#define S_MP_WARRAY_UNLOCK()
-#endif
-
 struct warray {
    void *warray;
 };
 typedef struct {
    struct warray *l_free, *l_used;
    size_t allocated, usable;
-   bool locking_enabled;
-   mp_lock lock;
 } st_warray;
 
 extern MP_PRIVATE st_warray s_mp_warray;

From 795f7ba5f5064e8f9bffc0691c5052e9e7a7b140 Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Thu, 14 Mar 2024 13:16:01 +0100
Subject: [PATCH 03/11] Add `s_mp_cmpexch_n()`

To be able to support this for MSVC as well, we have to move this into a
separate private API function.

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 s_mp_cmpexch_n.c  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 s_mp_warray_get.c |  2 +-
 tommath_class.h   |  5 +++++
 tommath_private.h |  6 ++----
 4 files changed, 52 insertions(+), 5 deletions(-)
 create mode 100644 s_mp_cmpexch_n.c

diff --git a/s_mp_cmpexch_n.c b/s_mp_cmpexch_n.c
new file mode 100644
index 000000000..6334d9d4b
--- /dev/null
+++ b/s_mp_cmpexch_n.c
@@ -0,0 +1,44 @@
+#include "tommath_private.h"
+#ifdef S_MP_CMPEXCH_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+#ifdef __GNUC__
+#define S_CMPEXCH_N_GCC_C
+static bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired)
+{
+   return __atomic_compare_exchange_n(ptr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+}
+#endif
+
+#ifdef _MSC_VER
+#define S_CMPEXCH_N_MSVC_C
+
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0501
+#endif
+#ifndef WINVER
+#define WINVER 0x0501
+#endif
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+static bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired)
+{
+   InterlockedCompareExchangePointer(ptr, desired, *(expected));
+   return *ptr == desired;
+}
+#endif
+
+bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired);
+bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired);
+
+bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired)
+{
+   if (MP_HAS(S_CMPEXCH_N_GCC)) return s_cmpexch_n_gcc(ptr, expected, desired);
+   if (MP_HAS(S_CMPEXCH_N_MSVC)) return s_cmpexch_n_msvc(ptr, expected, desired);
+   return false;
+}
+
+#endif
diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c
index 013e83322..068e8145d 100644
--- a/s_mp_warray_get.c
+++ b/s_mp_warray_get.c
@@ -15,7 +15,7 @@ void *s_mp_warray_get(void)
       if (s_mp_warray.l_free[n].warray == NULL)
          continue;
       ret = s_mp_warray.l_free[n].warray;
-      if (MP_CMPEXCH(&s_mp_warray.l_free[n].warray, &ret, NULL)) {
+      if (s_mp_cmpexch_n(&s_mp_warray.l_free[n].warray, &ret, NULL)) {
          s_mp_warray.l_used[n].warray = ret;
          goto LBL_OUT;
       }
diff --git a/tommath_class.h b/tommath_class.h
index 8841413cf..bb89cc237 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -136,6 +136,7 @@
 #   define MP_XOR_C
 #   define MP_ZERO_C
 #   define S_MP_ADD_C
+#   define S_MP_CMPEXCH_N_C
 #   define S_MP_COPY_DIGS_C
 #   define S_MP_DIV_3_C
 #   define S_MP_DIV_RECURSIVE_C
@@ -986,6 +987,9 @@
 #   define S_MP_ZERO_DIGS_C
 #endif
 
+#if defined(S_MP_CMPEXCH_N_C)
+#endif
+
 #if defined(S_MP_COPY_DIGS_C)
 #endif
 
@@ -1309,6 +1313,7 @@
 
 #if defined(S_MP_WARRAY_GET_C)
 #   define MP_WARRAY_INIT_C
+#   define S_MP_CMPEXCH_N_C
 #endif
 
 #if defined(S_MP_WARRAY_PUT_C)
diff --git a/tommath_private.h b/tommath_private.h
index 46c3afe2e..36291a061 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -104,10 +104,6 @@ extern void *MP_CALLOC(size_t nmemb, size_t size);
 extern void MP_FREE(void *mem, size_t size);
 #endif
 
-#ifndef MP_CMPEXCH
-#define MP_CMPEXCH(ptr, expected, desired) __atomic_compare_exchange_n(ptr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)
-#endif
-
 /* feature detection macro */
 #ifdef _MSC_VER
 /* Prevent false positive: not enough arguments for function-like macro invocation */
@@ -238,6 +234,8 @@ MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix,
 MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 
+MP_PRIVATE bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired);
+
 #ifdef MP_SMALL_STACK_SIZE
 #define MP_SMALL_STACK_SIZE_C
 #define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get()

From 98655a8e3c7249ce08a30a0b064c874aa5316c60 Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Thu, 14 Mar 2024 13:49:53 +0100
Subject: [PATCH 04/11] Add multi-threaded tests

Output gets garbled a bit, but we only care about the result, which is
`Tests OK/NOP/FAIL: 50/0/0`.

Add `-Wno-incomplete-setjmp-declaration` since the `clang-10` shipped with
Ubuntu 20.04 seems broken, and `-Wno-unknown-warning-option` since
`clang-8` doesn't know about that warning.

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 .github/workflows/main.yml |  2 ++
 demo/test.c                | 68 ++++++++++++++++++++++++++++++++------
 makefile_include.mk        |  2 +-
 testme.sh                  | 37 ++++++++++++---------
 4 files changed, 82 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 664cfcaf7..fd8e34cf5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -73,6 +73,8 @@ jobs:
           # Build with small stack-size
           - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                                SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'gcc-multilib' }
           - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                           SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread',                  SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',  OTHERDEPS: 'gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread',             SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',  OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
 
           # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs.
           #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune'
diff --git a/demo/test.c b/demo/test.c
index c4eb1f8a8..306a9c28a 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -2455,12 +2455,39 @@ static int test_mp_pack_unpack(void)
 #define ONLY_PUBLIC_API_C
 #endif
 
+#if !defined(LTM_TEST_MULTITHREAD) || !defined(MP_SMALL_STACK_SIZE)
+#define SINGLE_THREADED_C
+typedef unsigned long int pthread_t;
+extern int pthread_create(pthread_t *, const void *, void *(*)(void *), void *);
+extern int pthread_join(pthread_t, void **);
+#else
+#define MULTI_THREADED_C
+#include <pthread.h>
+#endif
+
+struct test_fn {
+   const char *name;
+   int (*fn)(void);
+};
+
+struct thread_info {
+   pthread_t thread_id;
+   const struct test_fn *t;
+   int ret;
+};
+
+static void *run(void *arg)
+{
+   struct thread_info *tinfo = arg;
+
+   tinfo->ret = tinfo->t->fn();
+
+   return arg;
+}
+
 static int unit_tests(int argc, char **argv)
 {
-   static const struct {
-      const char *name;
-      int (*fn)(void);
-   } test[] = {
+   static const struct test_fn test[] = {
 #define T0(n)              { #n, test_##n }
 #define T1(n, o)           { #n, MP_HAS(o) ? test_##n : NULL }
 #define T2(n, o1, o2)      { #n, (MP_HAS(o1) && MP_HAS(o2)) ? test_##n : NULL }
@@ -2522,9 +2549,10 @@ static int unit_tests(int argc, char **argv)
 #undef T2
 #undef T1
    };
+   struct thread_info test_threads[sizeof(test)/sizeof(test[0])], *res;
    unsigned long i, ok, fail, nop;
    uint64_t t;
-   int j;
+   int j = -1;
    ok = fail = nop = 0;
 
    t = (uint64_t)time(NULL);
@@ -2532,21 +2560,39 @@ static int unit_tests(int argc, char **argv)
    s_mp_rand_jenkins_init(t);
    mp_rand_source(s_mp_rand_jenkins);
 
+   if (MP_HAS(MULTI_THREADED)) {
+      printf("Multi-threading enabled\n\n");
+      DO(mp_warray_init(sizeof(test) / sizeof(test[0]), 1));
+      /* we ignore the fact that jenkings is not thread safe */
+      for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
+         test_threads[i].t = &test[i];
+         EXPECT(pthread_create(&test_threads[i].thread_id, NULL, run, &test_threads[i]) == 0);
+      }
+   }
 
    for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
-      if (argc > 1) {
-         for (j = 1; j < argc; ++j) {
-            if (strstr(test[i].name, argv[j]) != NULL) {
-               break;
+      if (MP_HAS(SINGLE_THREADED)) {
+         if (argc > 1) {
+            for (j = 1; j < argc; ++j) {
+               if (strstr(test[i].name, argv[j]) != NULL) {
+                  break;
+               }
             }
+            if (j == argc) continue;
          }
-         if (j == argc) continue;
+
+         if (test[i].fn)
+            j = test[i].fn();
+      } else if (MP_HAS(MULTI_THREADED)) {
+         EXPECT(pthread_join(test_threads[i].thread_id, (void **)&res) == 0);
+         j = res->ret;
       }
       printf("TEST %s\n", test[i].name);
+
       if (test[i].fn == NULL) {
          nop++;
          printf("NOP %s\n\n", test[i].name);
-      } else if (test[i].fn() == EXIT_SUCCESS) {
+      } else if (j == EXIT_SUCCESS) {
          ok++;
          printf("\n");
       } else {
diff --git a/makefile_include.mk b/makefile_include.mk
index da897396b..d47ea2ba2 100644
--- a/makefile_include.mk
+++ b/makefile_include.mk
@@ -97,7 +97,7 @@ endif
 endif # COMPILE_SIZE
 
 ifneq ($(findstring clang,$(CC)),)
-LTM_CFLAGS += -Wno-typedef-redefinition -Wno-tautological-compare -Wno-builtin-requires-header
+LTM_CFLAGS += -Wno-unknown-warning-option -Wno-typedef-redefinition -Wno-tautological-compare -Wno-builtin-requires-header -Wno-incomplete-setjmp-declaration
 ifdef IGNORE_SPEED
 #for dead code eliminiation
 LTM_CFLAGS += -O1
diff --git a/testme.sh b/testme.sh
index 089e42a70..92997a041 100755
--- a/testme.sh
+++ b/testme.sh
@@ -70,6 +70,8 @@ All other options will be tested with all MP_xBIT configurations.
                             runtime and may trigger the 30 minutes
                             timeout.
 
+    --multithread           Run tests in multi-threaded mode (via pthread).
+
 Godmode:
 
     --all                   Choose all architectures and gcc and clang
@@ -128,7 +130,7 @@ _make()
   echo -ne " Compile $1 $2"
   suffix=$(echo ${1}${2}  | tr ' ' '_')
   _fixup_cflags "$1"
-  CC="$1" CFLAGS="$2 $TEST_CFLAGS" make -j$MAKE_JOBS $3 $MAKE_OPTIONS 2>gcc_errors_${suffix}.log
+  CC="$1" CFLAGS="$2 $TEST_CFLAGS" LFLAGS="$4" LDFLAGS="$5" make -j$MAKE_JOBS $3 $MAKE_OPTIONS 2>gcc_errors_${suffix}.log
   errcnt=$(wc -l < gcc_errors_${suffix}.log)
   if [[ ${errcnt} -gt 1 ]]; then
     echo " failed"
@@ -148,10 +150,10 @@ _runtest()
     # "make tune" will run "tune_it.sh" automatically, hence "autotune", but it cannot
     # get switched off without some effort, so we just let it run twice for testing purposes
     echo -e "\rRun autotune $1 $2"
-    _make "$1" "$2" ""
+    _make "$1" "$2" "" "$3" "$4"
     $_timeout $TUNE_CMD > test_${suffix}.log || _die "running autotune" $?
   else
-    _make "$1" "$2" "test"
+    _make "$1" "$2" "test" "$3" "$4"
     echo -e "\rRun test $1 $2"
     $_timeout ./test > test_${suffix}.log || _die "running tests" $?
   fi
@@ -171,13 +173,13 @@ echo "MAKE_OPTIONS = \"$MAKE_OPTIONS\""
   if [[ "$MAKE_OPTIONS" =~ "tune"  ]]
   then
 echo "autotune branch"
-    _make "$1" "$2" ""
+    _make "$1" "$2" "" "$3" "$4"
     # The shell used for /bin/sh is DASH 0.5.7-4ubuntu1 on the author's machine which fails valgrind, so
     # we just run on instance of etc/tune with the same options as in etc/tune_it.sh
     echo -e "\rRun etc/tune $1 $2 once inside valgrind"
     $_timeout $VALGRIND_BIN $VALGRIND_OPTS $TUNE_CMD > test_${suffix}.log || _die "running etc/tune" $?
   else
-    _make "$1" "$2" "test"
+    _make "$1" "$2" "test" "$3" "$4"
     echo -e "\rRun test $1 $2 inside valgrind"
     $_timeout $VALGRIND_BIN $VALGRIND_OPTS ./test > test_${suffix}.log || _die "running tests" $?
   fi
@@ -301,6 +303,11 @@ do
     --symbols)
       CHECK_SYMBOLS="1"
     ;;
+    --multithread)
+      CFLAGS="$CFLAGS -DLTM_TEST_MULTITHREAD"
+      LFLAGS="$LFLAGS -pthread"
+      LDFLAGS="$LDFLAGS -pthread"
+    ;;
     --all)
       COMPILERS="gcc clang"
       ARCHFLAGS="-m64 -m32 -mx32"
@@ -376,9 +383,9 @@ then
   _banner "$CC"
   if [[ "$VALGRIND_BIN" != "" ]]
   then
-    _runvalgrind "$CC" ""
+    _runvalgrind "$CC" "" "$LFLAGS"  "$LDFLAGS"
   else
-    _runtest "$CC" ""
+    _runtest "$CC" ""  "$LFLAGS"  "$LDFLAGS"
   fi
   _exit
 fi
@@ -398,9 +405,9 @@ _banner
 if [[ "$TEST_VS_MTEST" != "" ]]
 then
    make clean > /dev/null
-   _make "${compilers[0]}" "${archflags[0]} $CFLAGS" "mtest_opponent"
+   _make "${compilers[0]}" "${archflags[0]} $CFLAGS" "mtest_opponent" "$LFLAGS" "$LDFLAGS"
    echo
-   _make "gcc" "$MTEST_RAND" "mtest"
+   _make "gcc" "$MTEST_RAND" "mtest" "$LFLAGS" "$LDFLAGS"
    echo
    echo "Run test vs. mtest for $TEST_VS_MTEST iterations"
    _timeout=""
@@ -429,15 +436,15 @@ do
     fi
     if [[ "$VALGRIND_BIN" != "" ]]
     then
-      _runvalgrind "$i" "$a $CFLAGS"
+      _runvalgrind "$i" "$a $CFLAGS" "$LFLAGS" "$LDFLAGS"
       [ "$WITH_LOW_MP" != "1" ] && continue
-      _runvalgrind "$i" "$a -DMP_16BIT $CFLAGS"
-      _runvalgrind "$i" "$a -DMP_32BIT $CFLAGS"
+      _runvalgrind "$i" "$a -DMP_16BIT $CFLAGS" "$LFLAGS" "$LDFLAGS"
+      _runvalgrind "$i" "$a -DMP_32BIT $CFLAGS" "$LFLAGS" "$LDFLAGS"
     else
-      _runtest "$i" "$a $CFLAGS"
+      _runtest "$i" "$a $CFLAGS" "$LFLAGS" "$LDFLAGS"
       [ "$WITH_LOW_MP" != "1" ] && continue
-      _runtest "$i" "$a -DMP_16BIT $CFLAGS"
-      _runtest "$i" "$a -DMP_32BIT $CFLAGS"
+      _runtest "$i" "$a -DMP_16BIT $CFLAGS" "$LFLAGS" "$LDFLAGS"
+      _runtest "$i" "$a -DMP_32BIT $CFLAGS" "$LFLAGS" "$LDFLAGS"
     fi
   done
 done

From fae9aa56457a8cba2f1ca93270ee167d319f7ddc Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Mon, 25 Mar 2024 11:57:42 +0100
Subject: [PATCH 05/11] Add tests for MSVC multi-threading

... and fix some MSVC-related (and other) things.

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 appveyor.yml      | 10 +++++-
 demo/test.c       | 86 +++++++++++++++++++++++++++++++++++++++++------
 makefile          |  6 ++--
 makefile.msvc     |  2 +-
 s_mp_cmpexch_n.c  |  3 +-
 s_mp_warray_get.c |  8 +++--
 tommath_c89.h     |  5 +++
 7 files changed, 101 insertions(+), 19 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 30d9ee757..5606d9abd 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -18,9 +18,17 @@ build_script:
       if "Visual Studio 2017"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
       if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64
       if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64
+      nmake -f makefile.msvc test.exe CFLAGS="/Ox /Oi /DMP_SMALL_STACK_SIZE"
+      copy /Y test.exe test_small_stack.exe
+      nmake -f makefile.msvc clean-obj
+      nmake -f makefile.msvc test.exe CFLAGS="/Ox /Oi /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD"
+      copy /Y test.exe test_small_stack_multithreaded.exe
+      nmake -f makefile.msvc clean-obj
       nmake -f makefile.msvc test.exe
       nmake -f makefile.msvc clean-obj
-      nmake -f makefile.msvc test_dll.exe CFLAGS="/Ox /MD /DLTM_TEST_DYNAMIC"
+      nmake -f makefile.msvc test_dll.exe CFLAGS="/Ox /Oi /MD /DLTM_TEST_DYNAMIC"
 test_script:
+- cmd: test_small_stack.exe
+- cmd: test_small_stack_multithreaded.exe
 - cmd: test.exe
 - cmd: test_dll.exe
diff --git a/demo/test.c b/demo/test.c
index 306a9c28a..5ab9686c3 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -2455,14 +2455,40 @@ static int test_mp_pack_unpack(void)
 #define ONLY_PUBLIC_API_C
 #endif
 
-#if !defined(LTM_TEST_MULTITHREAD) || !defined(MP_SMALL_STACK_SIZE)
+#if !defined(LTM_TEST_MULTITHREAD)
 #define SINGLE_THREADED_C
-typedef unsigned long int pthread_t;
-extern int pthread_create(pthread_t *, const void *, void *(*)(void *), void *);
-extern int pthread_join(pthread_t, void **);
+typedef uintptr_t thread_id_t;
 #else
 #define MULTI_THREADED_C
+#if !defined(_WIN32)
+#define MULTI_THREADED_PTHREAD_C
 #include <pthread.h>
+typedef pthread_t thread_id_t;
+#else
+#define MULTI_THREADED_MSVC_C
+
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x0501
+#endif
+#ifndef WINVER
+#define WINVER 0x0501
+#endif
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+typedef HANDLE thread_id_t;
+#endif
+#endif
+
+#if !defined(MULTI_THREADED_PTHREAD_C)
+extern int pthread_create(thread_id_t *, const void *, void *(*)(void *), void *);
+extern int pthread_join(thread_id_t, void **);
+#endif
+
+#if !defined(MULTI_THREADED_MSVC_C)
+extern thread_id_t CreateThread(void *, size_t, unsigned long (*)(void *), void *, unsigned long, void *);
+extern unsigned long WaitForSingleObject(thread_id_t hHandle, unsigned long dwMilliseconds);
+#define INFINITE ((unsigned long)-1)
 #endif
 
 struct test_fn {
@@ -2471,12 +2497,12 @@ struct test_fn {
 };
 
 struct thread_info {
-   pthread_t thread_id;
+   thread_id_t thread_id;
    const struct test_fn *t;
    int ret;
 };
 
-static void *run(void *arg)
+static void *run_pthread(void *arg)
 {
    struct thread_info *tinfo = arg;
 
@@ -2485,6 +2511,38 @@ static void *run(void *arg)
    return arg;
 }
 
+static unsigned long run_msvc(void *arg)
+{
+   struct thread_info *tinfo = arg;
+
+   tinfo->ret = tinfo->t->fn();
+
+   return 0;
+}
+
+static int thread_start(struct thread_info *info)
+{
+   if (MP_HAS(MULTI_THREADED_PTHREAD))
+      return pthread_create(&info->thread_id, NULL, run_pthread, info);
+   if (MP_HAS(MULTI_THREADED_MSVC)) {
+      info->thread_id = CreateThread(NULL, 0, run_msvc, info, 0, NULL);
+      return info->thread_id == (thread_id_t)NULL ? -1 : 0;
+   }
+   return -1;
+}
+
+static int thread_join(struct thread_info *info, struct thread_info **res)
+{
+   if (MP_HAS(MULTI_THREADED_PTHREAD))
+      return pthread_join(info->thread_id, (void **)res);
+   if (MP_HAS(MULTI_THREADED_MSVC)) {
+      WaitForSingleObject(info->thread_id, INFINITE);
+      *res = info;
+      return 0;
+   }
+   return -1;
+}
+
 static int unit_tests(int argc, char **argv)
 {
    static const struct test_fn test[] = {
@@ -2551,8 +2609,9 @@ static int unit_tests(int argc, char **argv)
    };
    struct thread_info test_threads[sizeof(test)/sizeof(test[0])], *res;
    unsigned long i, ok, fail, nop;
+   size_t n_threads = MP_HAS(MULTI_THREADED) ? sizeof(test) / sizeof(test[0]) : 1;
    uint64_t t;
-   int j = -1;
+   int j;
    ok = fail = nop = 0;
 
    t = (uint64_t)time(NULL);
@@ -2560,17 +2619,22 @@ static int unit_tests(int argc, char **argv)
    s_mp_rand_jenkins_init(t);
    mp_rand_source(s_mp_rand_jenkins);
 
+   if (MP_HAS(MP_SMALL_STACK_SIZE)) {
+      printf("Small-stack enabled with %zu warray buffers\n\n", n_threads);
+      DO(mp_warray_init(n_threads, 1));
+   }
+
    if (MP_HAS(MULTI_THREADED)) {
       printf("Multi-threading enabled\n\n");
-      DO(mp_warray_init(sizeof(test) / sizeof(test[0]), 1));
-      /* we ignore the fact that jenkings is not thread safe */
+      /* we ignore the fact that jenkins is not thread safe */
       for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
          test_threads[i].t = &test[i];
-         EXPECT(pthread_create(&test_threads[i].thread_id, NULL, run, &test_threads[i]) == 0);
+         EXPECT(thread_start(&test_threads[i]) == 0);
       }
    }
 
    for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
+      j = -1;
       if (MP_HAS(SINGLE_THREADED)) {
          if (argc > 1) {
             for (j = 1; j < argc; ++j) {
@@ -2584,7 +2648,7 @@ static int unit_tests(int argc, char **argv)
          if (test[i].fn)
             j = test[i].fn();
       } else if (MP_HAS(MULTI_THREADED)) {
-         EXPECT(pthread_join(test_threads[i].thread_id, (void **)&res) == 0);
+         EXPECT(thread_join(&test_threads[i], &res) == 0);
          j = res->ret;
       }
       printf("TEST %s\n", test[i].name);
diff --git a/makefile b/makefile
index ec32ecd09..437e3d4e9 100644
--- a/makefile
+++ b/makefile
@@ -172,9 +172,10 @@ c89:
 	-e 's/UINT32_MAX/0xFFFFFFFFu/g' \
 	-e 's/UINT64_MAX/(mp_u64)-1/g' \
 	-e 's/INT32_MAX/0x7FFFFFFF/g' \
-        -e 's/INT32_MIN/(-2147483647-1)/g' \
+	-e 's/INT32_MIN/(-2147483647-1)/g' \
 	-e 's/INT64_MAX/(mp_i64)(((mp_u64)1<<63)-1)/g' \
 	-e 's/INT64_MIN/(mp_i64)((mp_u64)1<<63)/g' \
+	-e 's/uintptr_t/mp_uintptr/g' \
 	-e 's/SIZE_MAX/((size_t)-1)/g' \
 	-e 's/\(PRI[ioux]64\)/MP_\1/g' \
 	-e 's/uint\([0-9][0-9]*\)_t/mp_u\1/g' \
@@ -195,10 +196,11 @@ c99:
 	-e 's/false_/MP_NO_/g' \
 	-e 's/0xFFFFFFFFu/UINT32_MAX/g' \
 	-e 's/(mp_u64)-1/UINT64_MAX/g' \
-        -e 's/(-2147483647-1)/INT32_MIN/g' \
+	-e 's/(-2147483647-1)/INT32_MIN/g' \
 	-e 's/0x7FFFFFFF/INT32_MAX/g' \
 	-e 's/(mp_i64)((mp_u64)1<<63)/INT64_MIN/g' \
 	-e 's/(mp_i64)(((mp_u64)1<<63)-1)/INT64_MAX/g' \
+	-e 's/mp_uintptr/uintptr_t/g' \
 	-e 's/((size_t)-1)/SIZE_MAX/g' \
 	-e 's/MP_\(PRI[ioux]64\)/\1/g' \
 	-e 's/mp_u\([0-9][0-9]*\)/uint\1_t/g' \
diff --git a/makefile.msvc b/makefile.msvc
index 5d1285490..da5e2fd62 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -11,7 +11,7 @@
 
 #The following can be overridden from command line e.g. make -f makefile.msvc CC=gcc ARFLAGS=rcs
 PREFIX    = c:\devel
-CFLAGS    = /Ox
+CFLAGS    = /Ox /Oi
 LDFLAGS   =
 
 #Compilation flags
diff --git a/s_mp_cmpexch_n.c b/s_mp_cmpexch_n.c
index 6334d9d4b..e8ef969c2 100644
--- a/s_mp_cmpexch_n.c
+++ b/s_mp_cmpexch_n.c
@@ -26,8 +26,7 @@ static bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired)
 
 static bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired)
 {
-   InterlockedCompareExchangePointer(ptr, desired, *(expected));
-   return *ptr == desired;
+   return InterlockedCompareExchangePointer(ptr, desired, *(expected));
 }
 #endif
 
diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c
index 068e8145d..39176eb2c 100644
--- a/s_mp_warray_get.c
+++ b/s_mp_warray_get.c
@@ -11,14 +11,18 @@ void *s_mp_warray_get(void)
       if (mp_warray_init(1, false) != MP_OKAY)
          return NULL;
    }
-   for (n = 0; n < s_mp_warray.allocated; ++n) {
-      if (s_mp_warray.l_free[n].warray == NULL)
+   for (n = 0; n < s_mp_warray.allocated;) {
+      if (s_mp_warray.l_free[n].warray == NULL) {
+         n++;
          continue;
+      }
       ret = s_mp_warray.l_free[n].warray;
       if (s_mp_cmpexch_n(&s_mp_warray.l_free[n].warray, &ret, NULL)) {
          s_mp_warray.l_used[n].warray = ret;
          goto LBL_OUT;
       }
+      /* restart from the beginning if we missed a potential slot */
+      n = 0;
    }
    ret = NULL;
    if (s_mp_warray.allocated + 1 > s_mp_warray.usable)
diff --git a/tommath_c89.h b/tommath_c89.h
index 49400a131..22436366b 100644
--- a/tommath_c89.h
+++ b/tommath_c89.h
@@ -26,6 +26,11 @@ typedef __UINT8_TYPE__  mp_u8;
 typedef __UINT16_TYPE__ mp_u16;
 typedef __UINT32_TYPE__ mp_u32;
 typedef __UINT64_TYPE__ mp_u64;
+# if __WORDSIZE == 64
+typedef __UINT64_TYPE__ mp_uintptr;
+# else
+typedef __UINT32_TYPE__ mp_uintptr;
+# endif
 
 /* inttypes.h replacement, printf format specifier */
 # if __WORDSIZE == 64

From 334465dd1773832d224175fdb252c741e479d32e Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Wed, 27 Mar 2024 14:49:09 +0100
Subject: [PATCH 06/11] Update makefiles

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 libtommath_VS2008.vcproj | 28 ++++++++++++++++++++++++++++
 makefile                 | 15 ++++++++-------
 makefile.mingw           | 15 ++++++++-------
 makefile.msvc            | 15 ++++++++-------
 makefile.shared          | 15 ++++++++-------
 makefile.unix            | 15 ++++++++-------
 sources.cmake            |  7 +++++++
 7 files changed, 75 insertions(+), 35 deletions(-)

diff --git a/libtommath_VS2008.vcproj b/libtommath_VS2008.vcproj
index 13158a09d..816217e8d 100644
--- a/libtommath_VS2008.vcproj
+++ b/libtommath_VS2008.vcproj
@@ -792,6 +792,14 @@
 			RelativePath="mp_unpack.c"
 			>
 		</File>
+		<File
+			RelativePath="mp_warray_free.c"
+			>
+		</File>
+		<File
+			RelativePath="mp_warray_init.c"
+			>
+		</File>
 		<File
 			RelativePath="mp_xor.c"
 			>
@@ -804,6 +812,10 @@
 			RelativePath="s_mp_add.c"
 			>
 		</File>
+		<File
+			RelativePath="s_mp_cmpexch_n.c"
+			>
+		</File>
 		<File
 			RelativePath="s_mp_copy_digs.c"
 			>
@@ -928,6 +940,22 @@
 			RelativePath="s_mp_sub.c"
 			>
 		</File>
+		<File
+			RelativePath="s_mp_warray.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_warray_free.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_warray_get.c"
+			>
+		</File>
+		<File
+			RelativePath="s_mp_warray_put.c"
+			>
+		</File>
 		<File
 			RelativePath="s_mp_zero_buf.c"
 			>
diff --git a/makefile b/makefile
index 437e3d4e9..a1729d7f0 100644
--- a/makefile
+++ b/makefile
@@ -43,13 +43,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \
+s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
+s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \
+s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \
+s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \
+s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \
+s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \
+s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o
 
 #END_INS
 
diff --git a/makefile.mingw b/makefile.mingw
index 532747be0..7597ba6df 100644
--- a/makefile.mingw
+++ b/makefile.mingw
@@ -45,13 +45,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \
+s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
+s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \
+s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \
+s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \
+s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \
+s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \
+s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o
 
 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
diff --git a/makefile.msvc b/makefile.msvc
index da5e2fd62..e6e8db7fe 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -41,13 +41,14 @@ mp_reduce_2k_l.obj mp_reduce_2k_setup.obj mp_reduce_2k_setup_l.obj mp_reduce_is_
 mp_reduce_setup.obj mp_root_n.obj mp_rshd.obj mp_sbin_size.obj mp_set.obj mp_set_double.obj mp_set_i32.obj mp_set_i64.obj \
 mp_set_l.obj mp_set_u32.obj mp_set_u64.obj mp_set_ul.obj mp_shrink.obj mp_signed_rsh.obj mp_sqrmod.obj mp_sqrt.obj \
 mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj mp_to_ubin.obj mp_ubin_size.obj \
-mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj s_mp_div_recursive.obj \
-s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_fp_log.obj s_mp_fp_log_d.obj \
-s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj \
-s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \
-s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj s_mp_radix_map.obj \
-s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj \
-s_mp_sqr_toom.obj s_mp_sub.obj s_mp_zero_buf.obj s_mp_zero_digs.obj
+mp_unpack.obj mp_warray_free.obj mp_warray_init.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_cmpexch_n.obj \
+s_mp_copy_digs.obj s_mp_div_3.obj s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj \
+s_mp_exptmod_fast.obj s_mp_fp_log.obj s_mp_fp_log_d.obj s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj \
+s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj \
+s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj s_mp_prime_is_divisible.obj \
+s_mp_prime_tab.obj s_mp_radix_map.obj s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj \
+s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj s_mp_warray.obj s_mp_warray_free.obj \
+s_mp_warray_get.obj s_mp_warray_put.obj s_mp_zero_buf.obj s_mp_zero_digs.obj
 
 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
diff --git a/makefile.shared b/makefile.shared
index c9b933513..315252f35 100644
--- a/makefile.shared
+++ b/makefile.shared
@@ -40,13 +40,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \
+s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
+s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \
+s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \
+s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \
+s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \
+s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \
+s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o
 
 #END_INS
 
diff --git a/makefile.unix b/makefile.unix
index 34ebd1a86..c74ec5d7b 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -46,13 +46,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \
-s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \
-s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \
-s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \
-s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \
-s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \
+s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
+s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \
+s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \
+s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \
+s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \
+s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \
+s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o
 
 
 HEADERS_PUB=tommath.h
diff --git a/sources.cmake b/sources.cmake
index bbb2aeab6..a23dbd451 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -122,9 +122,12 @@ mp_to_sbin.c
 mp_to_ubin.c
 mp_ubin_size.c
 mp_unpack.c
+mp_warray_free.c
+mp_warray_init.c
 mp_xor.c
 mp_zero.c
 s_mp_add.c
+s_mp_cmpexch_n.c
 s_mp_copy_digs.c
 s_mp_div_3.c
 s_mp_div_recursive.c
@@ -156,6 +159,10 @@ s_mp_sqr_comba.c
 s_mp_sqr_karatsuba.c
 s_mp_sqr_toom.c
 s_mp_sub.c
+s_mp_warray.c
+s_mp_warray_free.c
+s_mp_warray_get.c
+s_mp_warray_put.c
 s_mp_zero_buf.c
 s_mp_zero_digs.c
 )

From 33a9d0d5959abda6cca4fcc0fa9cd71aa653088d Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Fri, 29 Mar 2024 10:50:28 +0100
Subject: [PATCH 07/11] Use Thread Local Storage for `warray` buffer

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 demo/test.c        | 20 +++++++++++---------
 mp_warray_free.c   | 18 ++++++------------
 mp_warray_init.c   | 46 ----------------------------------------------
 s_mp_cmpexch_n.c   | 43 -------------------------------------------
 s_mp_warray.c      |  2 +-
 s_mp_warray_free.c | 17 -----------------
 s_mp_warray_get.c  | 34 +++++++---------------------------
 s_mp_warray_put.c  | 12 ++++--------
 tommath.h          |  1 -
 tommath_private.h  | 17 ++++++++---------
 10 files changed, 37 insertions(+), 173 deletions(-)
 delete mode 100644 mp_warray_init.c
 delete mode 100644 s_mp_cmpexch_n.c
 delete mode 100644 s_mp_warray_free.c

diff --git a/demo/test.c b/demo/test.c
index 5ab9686c3..2fa6e08db 100644
--- a/demo/test.c
+++ b/demo/test.c
@@ -2502,20 +2502,24 @@ struct thread_info {
    int ret;
 };
 
-static void *run_pthread(void *arg)
+static void run(struct thread_info *tinfo)
 {
-   struct thread_info *tinfo = arg;
-
    tinfo->ret = tinfo->t->fn();
 
+   if (mp_warray_free() == -2)
+      tinfo->ret = EXIT_FAILURE;
+}
+
+static void *run_pthread(void *arg)
+{
+   run(arg);
+
    return arg;
 }
 
 static unsigned long run_msvc(void *arg)
 {
-   struct thread_info *tinfo = arg;
-
-   tinfo->ret = tinfo->t->fn();
+   run(arg);
 
    return 0;
 }
@@ -2609,7 +2613,6 @@ static int unit_tests(int argc, char **argv)
    };
    struct thread_info test_threads[sizeof(test)/sizeof(test[0])], *res;
    unsigned long i, ok, fail, nop;
-   size_t n_threads = MP_HAS(MULTI_THREADED) ? sizeof(test) / sizeof(test[0]) : 1;
    uint64_t t;
    int j;
    ok = fail = nop = 0;
@@ -2620,8 +2623,7 @@ static int unit_tests(int argc, char **argv)
    mp_rand_source(s_mp_rand_jenkins);
 
    if (MP_HAS(MP_SMALL_STACK_SIZE)) {
-      printf("Small-stack enabled with %zu warray buffers\n\n", n_threads);
-      DO(mp_warray_init(n_threads, 1));
+      printf("Small-stack enabled\n\n");
    }
 
    if (MP_HAS(MULTI_THREADED)) {
diff --git a/mp_warray_free.c b/mp_warray_free.c
index 088efefc4..f7470f818 100644
--- a/mp_warray_free.c
+++ b/mp_warray_free.c
@@ -9,19 +9,13 @@ MP_STATIC_ASSERT(warray_free_sz_does_not_overflow, (sizeof(mp_word) * MP_WARRAY)
 static int s_warray_free(void)
 {
    int ret = 0;
-   size_t n;
-   for (n = 0; n < s_mp_warray.allocated; ++n) {
-      if (s_mp_warray.l_used[n].warray) {
-         ret = -2;
-         goto ERR_OUT;
-      }
+   if (s_mp_warray.w_used)
+      return -2;
+   if (s_mp_warray.w_free) {
+      s_mp_zero_buf(s_mp_warray.w_free, sizeof(mp_word) * MP_WARRAY);
+      MP_FREE(s_mp_warray.w_free, sizeof(mp_word) * MP_WARRAY);
+      s_mp_warray.w_free = NULL;
    }
-   for (n = 0; n < s_mp_warray.allocated; ++n) {
-      MP_FREE(s_mp_warray.l_free[n].warray, sizeof(mp_word) * MP_WARRAY);
-      s_mp_warray.l_free[n].warray = NULL;
-   }
-   s_mp_warray_free(s_mp_warray.usable);
-ERR_OUT:
    return ret;
 }
 
diff --git a/mp_warray_init.c b/mp_warray_init.c
deleted file mode 100644
index c25098861..000000000
--- a/mp_warray_init.c
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "tommath_private.h"
-#ifdef MP_WARRAY_INIT_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis */
-/* SPDX-License-Identifier: Unlicense */
-
-static mp_err s_warray_init(size_t n_alloc, bool preallocate)
-{
-   size_t n;
-   if (s_mp_warray.l_free != NULL || s_mp_warray.l_used != NULL) {
-      return MP_VAL;
-   }
-
-   s_mp_warray.l_free = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_free)));
-   s_mp_warray.l_used = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_used)));
-   if (s_mp_warray.l_free == NULL || s_mp_warray.l_used == NULL) {
-      s_mp_warray_free(n_alloc);
-      return MP_MEM;
-   }
-
-   if (preallocate) {
-      for (n = 0; n < n_alloc; ++n) {
-         s_mp_warray.l_free[n].warray = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
-         if (s_mp_warray.l_free[n].warray == NULL) {
-            while (n > 0) {
-               n--;
-               MP_FREE(s_mp_warray.l_free[n].warray, MP_WARRAY * sizeof(mp_word));
-               s_mp_warray.l_free[n].warray = NULL;
-            }
-            s_mp_warray_free(n_alloc);
-            return MP_MEM;
-         }
-      }
-      s_mp_warray.allocated = n_alloc;
-   }
-
-   s_mp_warray.usable = n_alloc;
-   return MP_OKAY;
-}
-
-mp_err mp_warray_init(size_t n_alloc, bool preallocate)
-{
-   if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate);
-   return MP_ERR;
-}
-
-#endif
diff --git a/s_mp_cmpexch_n.c b/s_mp_cmpexch_n.c
deleted file mode 100644
index e8ef969c2..000000000
--- a/s_mp_cmpexch_n.c
+++ /dev/null
@@ -1,43 +0,0 @@
-#include "tommath_private.h"
-#ifdef S_MP_CMPEXCH_N_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis */
-/* SPDX-License-Identifier: Unlicense */
-
-#ifdef __GNUC__
-#define S_CMPEXCH_N_GCC_C
-static bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired)
-{
-   return __atomic_compare_exchange_n(ptr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
-}
-#endif
-
-#ifdef _MSC_VER
-#define S_CMPEXCH_N_MSVC_C
-
-#ifndef _WIN32_WINNT
-#define _WIN32_WINNT 0x0501
-#endif
-#ifndef WINVER
-#define WINVER 0x0501
-#endif
-
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-
-static bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired)
-{
-   return InterlockedCompareExchangePointer(ptr, desired, *(expected));
-}
-#endif
-
-bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired);
-bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired);
-
-bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired)
-{
-   if (MP_HAS(S_CMPEXCH_N_GCC)) return s_cmpexch_n_gcc(ptr, expected, desired);
-   if (MP_HAS(S_CMPEXCH_N_MSVC)) return s_cmpexch_n_msvc(ptr, expected, desired);
-   return false;
-}
-
-#endif
diff --git a/s_mp_warray.c b/s_mp_warray.c
index d181057cb..1b8b068b7 100644
--- a/s_mp_warray.c
+++ b/s_mp_warray.c
@@ -3,6 +3,6 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis */
 /* SPDX-License-Identifier: Unlicense */
 
-st_warray s_mp_warray;
+mp_thread st_warray s_mp_warray = { 0 };
 
 #endif
diff --git a/s_mp_warray_free.c b/s_mp_warray_free.c
deleted file mode 100644
index 9d8b75eb1..000000000
--- a/s_mp_warray_free.c
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "tommath_private.h"
-#ifdef S_MP_WARRAY_FREE_C
-/* LibTomMath, multiple-precision integer library -- Tom St Denis */
-/* SPDX-License-Identifier: Unlicense */
-
-void s_mp_warray_free(size_t n)
-{
-   (void)n;
-   MP_FREE(s_mp_warray.l_free, n * sizeof(*(s_mp_warray.l_free)));
-   MP_FREE(s_mp_warray.l_used, n * sizeof(*(s_mp_warray.l_used)));
-   s_mp_warray.l_free = NULL;
-   s_mp_warray.l_used = NULL;
-   s_mp_warray.allocated = 0;
-   s_mp_warray.usable = 0;
-}
-
-#endif
diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c
index 39176eb2c..26b0d7c10 100644
--- a/s_mp_warray_get.c
+++ b/s_mp_warray_get.c
@@ -5,34 +5,14 @@
 
 void *s_mp_warray_get(void)
 {
-   void *ret = NULL;
-   size_t n;
-   if (s_mp_warray.usable == 0) {
-      if (mp_warray_init(1, false) != MP_OKAY)
-         return NULL;
+   if (s_mp_warray.w_used)
+      return NULL;
+   if (s_mp_warray.w_free == NULL) {
+      s_mp_warray.w_free = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
    }
-   for (n = 0; n < s_mp_warray.allocated;) {
-      if (s_mp_warray.l_free[n].warray == NULL) {
-         n++;
-         continue;
-      }
-      ret = s_mp_warray.l_free[n].warray;
-      if (s_mp_cmpexch_n(&s_mp_warray.l_free[n].warray, &ret, NULL)) {
-         s_mp_warray.l_used[n].warray = ret;
-         goto LBL_OUT;
-      }
-      /* restart from the beginning if we missed a potential slot */
-      n = 0;
-   }
-   ret = NULL;
-   if (s_mp_warray.allocated + 1 > s_mp_warray.usable)
-      goto LBL_OUT;
-   ret = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
-   if (ret != NULL)
-      s_mp_warray.l_used[s_mp_warray.allocated++].warray = ret;
-
-LBL_OUT:
-   return ret;
+   s_mp_warray.w_used = s_mp_warray.w_free;
+   s_mp_warray.w_free = NULL;
+   return s_mp_warray.w_used;
 }
 
 #endif
diff --git a/s_mp_warray_put.c b/s_mp_warray_put.c
index 4cf413d62..79e014acd 100644
--- a/s_mp_warray_put.c
+++ b/s_mp_warray_put.c
@@ -5,14 +5,10 @@
 
 void s_mp_warray_put(void *w)
 {
-   size_t n, allocated = s_mp_warray.allocated;
-   for (n = 0; n < allocated; ++n) {
-      if (s_mp_warray.l_used[n].warray == w) {
-         s_mp_warray.l_used[n].warray = NULL;
-         s_mp_warray.l_free[n].warray = w;
-         break;
-      }
-   }
+   if (s_mp_warray.w_free || s_mp_warray.w_used != w)
+      return;
+   s_mp_warray.w_free = w;
+   s_mp_warray.w_used = NULL;
 }
 
 #endif
diff --git a/tommath.h b/tommath.h
index 7da36d0e9..1820d2436 100644
--- a/tommath.h
+++ b/tommath.h
@@ -588,7 +588,6 @@ mp_err mp_fread(mp_int *a, int radix, FILE *stream) MP_WUR;
 mp_err mp_fwrite(const mp_int *a, int radix, FILE *stream) MP_WUR;
 #endif
 
-mp_err mp_warray_init(size_t n_alloc, bool preallocate);
 int mp_warray_free(void);
 
 #define mp_to_binary(M, S, N)  mp_to_radix((M), (S), (N), NULL, 2)
diff --git a/tommath_private.h b/tommath_private.h
index 36291a061..9c25f330f 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -234,8 +234,6 @@ MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix,
 MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 
-MP_PRIVATE bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired);
-
 #ifdef MP_SMALL_STACK_SIZE
 #define MP_SMALL_STACK_SIZE_C
 #define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get()
@@ -247,19 +245,20 @@ MP_PRIVATE bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired);
 #define MP_CHECK_WARRAY(name)
 #endif
 
-struct warray {
-   void *warray;
-};
+#if defined(_MSC_VER)
+#define mp_thread __declspec(thread)
+#elif defined(__GNUC__)
+#define mp_thread __thread
+#endif
+
 typedef struct {
-   struct warray *l_free, *l_used;
-   size_t allocated, usable;
+   void *w_free, *w_used;
 } st_warray;
 
-extern MP_PRIVATE st_warray s_mp_warray;
+extern MP_PRIVATE mp_thread st_warray s_mp_warray;
 
 MP_PRIVATE void *s_mp_warray_get(void);
 MP_PRIVATE void s_mp_warray_put(void *w);
-MP_PRIVATE void s_mp_warray_free(size_t n);
 
 #define MP_RADIX_MAP_REVERSE_SIZE 80u
 extern MP_PRIVATE const char s_mp_radix_map[];

From 513d48d404c77f6886297828e33c4002541aac10 Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Fri, 29 Mar 2024 10:50:45 +0100
Subject: [PATCH 08/11] Regenerate build files, tommath.def and tommath_class.h

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 libtommath_VS2008.vcproj | 12 ------------
 makefile                 | 16 ++++++++--------
 makefile.mingw           | 16 ++++++++--------
 makefile.msvc            | 16 ++++++++--------
 makefile.shared          | 16 ++++++++--------
 makefile.unix            | 16 ++++++++--------
 sources.cmake            |  3 ---
 tommath.def              |  1 -
 tommath_class.h          | 17 +----------------
 9 files changed, 41 insertions(+), 72 deletions(-)

diff --git a/libtommath_VS2008.vcproj b/libtommath_VS2008.vcproj
index 816217e8d..71dd3807f 100644
--- a/libtommath_VS2008.vcproj
+++ b/libtommath_VS2008.vcproj
@@ -796,10 +796,6 @@
 			RelativePath="mp_warray_free.c"
 			>
 		</File>
-		<File
-			RelativePath="mp_warray_init.c"
-			>
-		</File>
 		<File
 			RelativePath="mp_xor.c"
 			>
@@ -812,10 +808,6 @@
 			RelativePath="s_mp_add.c"
 			>
 		</File>
-		<File
-			RelativePath="s_mp_cmpexch_n.c"
-			>
-		</File>
 		<File
 			RelativePath="s_mp_copy_digs.c"
 			>
@@ -944,10 +936,6 @@
 			RelativePath="s_mp_warray.c"
 			>
 		</File>
-		<File
-			RelativePath="s_mp_warray_free.c"
-			>
-		</File>
 		<File
 			RelativePath="s_mp_warray_get.c"
 			>
diff --git a/makefile b/makefile
index a1729d7f0..8f211f5f2 100644
--- a/makefile
+++ b/makefile
@@ -43,14 +43,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \
-s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
-s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \
-s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \
-s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \
-s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \
-s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \
-s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \
+s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \
+s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \
+s_mp_zero_buf.o s_mp_zero_digs.o
 
 #END_INS
 
diff --git a/makefile.mingw b/makefile.mingw
index 7597ba6df..e2445e8a0 100644
--- a/makefile.mingw
+++ b/makefile.mingw
@@ -45,14 +45,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \
-s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
-s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \
-s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \
-s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \
-s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \
-s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \
-s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \
+s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \
+s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \
+s_mp_zero_buf.o s_mp_zero_digs.o
 
 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
diff --git a/makefile.msvc b/makefile.msvc
index e6e8db7fe..8540ca33d 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -41,14 +41,14 @@ mp_reduce_2k_l.obj mp_reduce_2k_setup.obj mp_reduce_2k_setup_l.obj mp_reduce_is_
 mp_reduce_setup.obj mp_root_n.obj mp_rshd.obj mp_sbin_size.obj mp_set.obj mp_set_double.obj mp_set_i32.obj mp_set_i64.obj \
 mp_set_l.obj mp_set_u32.obj mp_set_u64.obj mp_set_ul.obj mp_shrink.obj mp_signed_rsh.obj mp_sqrmod.obj mp_sqrt.obj \
 mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj mp_to_ubin.obj mp_ubin_size.obj \
-mp_unpack.obj mp_warray_free.obj mp_warray_init.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_cmpexch_n.obj \
-s_mp_copy_digs.obj s_mp_div_3.obj s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj \
-s_mp_exptmod_fast.obj s_mp_fp_log.obj s_mp_fp_log_d.obj s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj \
-s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj \
-s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj s_mp_prime_is_divisible.obj \
-s_mp_prime_tab.obj s_mp_radix_map.obj s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj \
-s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj s_mp_warray.obj s_mp_warray_free.obj \
-s_mp_warray_get.obj s_mp_warray_put.obj s_mp_zero_buf.obj s_mp_zero_digs.obj
+mp_unpack.obj mp_warray_free.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj \
+s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_fp_log.obj \
+s_mp_fp_log_d.obj s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log_2expt.obj \
+s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj \
+s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj \
+s_mp_radix_map.obj s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj \
+s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj s_mp_warray.obj s_mp_warray_get.obj s_mp_warray_put.obj \
+s_mp_zero_buf.obj s_mp_zero_digs.obj
 
 HEADERS_PUB=tommath.h
 HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB)
diff --git a/makefile.shared b/makefile.shared
index 315252f35..50c335269 100644
--- a/makefile.shared
+++ b/makefile.shared
@@ -40,14 +40,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \
-s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
-s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \
-s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \
-s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \
-s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \
-s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \
-s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \
+s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \
+s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \
+s_mp_zero_buf.o s_mp_zero_digs.o
 
 #END_INS
 
diff --git a/makefile.unix b/makefile.unix
index c74ec5d7b..58642098d 100644
--- a/makefile.unix
+++ b/makefile.unix
@@ -46,14 +46,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m
 mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \
 mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \
 mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \
-mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \
-s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \
-s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \
-s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \
-s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \
-s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \
-s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \
-s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o
+mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \
+s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \
+s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \
+s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \
+s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \
+s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \
+s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \
+s_mp_zero_buf.o s_mp_zero_digs.o
 
 
 HEADERS_PUB=tommath.h
diff --git a/sources.cmake b/sources.cmake
index a23dbd451..103e9c099 100644
--- a/sources.cmake
+++ b/sources.cmake
@@ -123,11 +123,9 @@ mp_to_ubin.c
 mp_ubin_size.c
 mp_unpack.c
 mp_warray_free.c
-mp_warray_init.c
 mp_xor.c
 mp_zero.c
 s_mp_add.c
-s_mp_cmpexch_n.c
 s_mp_copy_digs.c
 s_mp_div_3.c
 s_mp_div_recursive.c
@@ -160,7 +158,6 @@ s_mp_sqr_karatsuba.c
 s_mp_sqr_toom.c
 s_mp_sub.c
 s_mp_warray.c
-s_mp_warray_free.c
 s_mp_warray_get.c
 s_mp_warray_put.c
 s_mp_zero_buf.c
diff --git a/tommath.def b/tommath.def
index 7aa5860f7..ed5aa8b0c 100644
--- a/tommath.def
+++ b/tommath.def
@@ -126,7 +126,6 @@ EXPORTS
     mp_ubin_size
     mp_unpack
     mp_warray_free
-    mp_warray_init
     mp_xor
     mp_zero
     MP_MUL_KARATSUBA_CUTOFF
diff --git a/tommath_class.h b/tommath_class.h
index bb89cc237..09bb3ea63 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -132,11 +132,9 @@
 #   define MP_UBIN_SIZE_C
 #   define MP_UNPACK_C
 #   define MP_WARRAY_FREE_C
-#   define MP_WARRAY_INIT_C
 #   define MP_XOR_C
 #   define MP_ZERO_C
 #   define S_MP_ADD_C
-#   define S_MP_CMPEXCH_N_C
 #   define S_MP_COPY_DIGS_C
 #   define S_MP_DIV_3_C
 #   define S_MP_DIV_RECURSIVE_C
@@ -169,7 +167,6 @@
 #   define S_MP_SQR_TOOM_C
 #   define S_MP_SUB_C
 #   define S_MP_WARRAY_C
-#   define S_MP_WARRAY_FREE_C
 #   define S_MP_WARRAY_GET_C
 #   define S_MP_WARRAY_PUT_C
 #   define S_MP_ZERO_BUF_C
@@ -965,11 +962,7 @@
 #endif
 
 #if defined(MP_WARRAY_FREE_C)
-#   define S_MP_WARRAY_FREE_C
-#endif
-
-#if defined(MP_WARRAY_INIT_C)
-#   define S_MP_WARRAY_FREE_C
+#   define S_MP_ZERO_BUF_C
 #endif
 
 #if defined(MP_XOR_C)
@@ -987,9 +980,6 @@
 #   define S_MP_ZERO_DIGS_C
 #endif
 
-#if defined(S_MP_CMPEXCH_N_C)
-#endif
-
 #if defined(S_MP_COPY_DIGS_C)
 #endif
 
@@ -1308,12 +1298,7 @@
 #if defined(S_MP_WARRAY_C)
 #endif
 
-#if defined(S_MP_WARRAY_FREE_C)
-#endif
-
 #if defined(S_MP_WARRAY_GET_C)
-#   define MP_WARRAY_INIT_C
-#   define S_MP_CMPEXCH_N_C
 #endif
 
 #if defined(S_MP_WARRAY_PUT_C)

From 4c2e17793856bb3f5280a379079144b3f3fc2af5 Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Fri, 29 Mar 2024 11:10:19 +0100
Subject: [PATCH 09/11] Use appveyor build matrix

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 appveyor.yml | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 5606d9abd..b2e2d3907 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -11,6 +11,16 @@ image:
 - Visual Studio 2019
 - Visual Studio 2017
 - Visual Studio 2015
+environment:
+  matrix:
+  - CFLAGS_VAR: ""
+    CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC\""
+
+  - CFLAGS_VAR: "CFLAGS=\"/Ox /Oi /DMP_SMALL_STACK_SIZE\""
+    CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC /DMP_SMALL_STACK_SIZE\""
+
+  - CFLAGS_VAR: "CFLAGS=\"/Ox /Oi /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD\""
+    CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD\""
 build_script:
 - cmd: >-
     if "Visual Studio 2022"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"
@@ -18,17 +28,9 @@ build_script:
       if "Visual Studio 2017"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
       if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64
       if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64
-      nmake -f makefile.msvc test.exe CFLAGS="/Ox /Oi /DMP_SMALL_STACK_SIZE"
-      copy /Y test.exe test_small_stack.exe
+      nmake -f makefile.msvc test.exe %CFLAGS_VAR%
       nmake -f makefile.msvc clean-obj
-      nmake -f makefile.msvc test.exe CFLAGS="/Ox /Oi /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD"
-      copy /Y test.exe test_small_stack_multithreaded.exe
-      nmake -f makefile.msvc clean-obj
-      nmake -f makefile.msvc test.exe
-      nmake -f makefile.msvc clean-obj
-      nmake -f makefile.msvc test_dll.exe CFLAGS="/Ox /Oi /MD /DLTM_TEST_DYNAMIC"
+      nmake -f makefile.msvc test_dll.exe %CFLAGS_VAR_DLL%
 test_script:
-- cmd: test_small_stack.exe
-- cmd: test_small_stack_multithreaded.exe
 - cmd: test.exe
 - cmd: test_dll.exe

From 6c70ef1cec6fe26f4e9d316f72b0b2134268634b Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Fri, 29 Mar 2024 12:15:32 +0100
Subject: [PATCH 10/11] Run small-stack tests with valgrind

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 .github/workflows/main.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index fd8e34cf5..18a832bbe 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -72,9 +72,9 @@ jobs:
 
           # Build with small stack-size
           - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                                SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'gcc-multilib' }
-          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE',                           SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
-          - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread',                  SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',  OTHERDEPS: 'gcc-multilib' }
-          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread',             SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',  OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --limit-valgrind',          SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'clang-10 llvm-10 libc6-dev-i386 gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread --limit-valgrind', SANITIZER: '',  COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'libc6-dev-i386 gcc-multilib' }
+          - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread',             SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '',        OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
 
           # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs.
           #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune'

From 3570e12884d54e1503fda03569517530c01f5ddd Mon Sep 17 00:00:00 2001
From: Steffen Jaeckel <s@jaeckel.eu>
Date: Wed, 3 Apr 2024 15:48:23 +0200
Subject: [PATCH 11/11] Disable MP_SMALL_STACK_SIZE on MSVC

Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
---
 appveyor.yml      |  8 +-------
 makefile.msvc     |  2 +-
 tommath_private.h | 23 +++++++++++++++++++----
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index b2e2d3907..2134f2ddf 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -14,13 +14,7 @@ image:
 environment:
   matrix:
   - CFLAGS_VAR: ""
-    CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC\""
-
-  - CFLAGS_VAR: "CFLAGS=\"/Ox /Oi /DMP_SMALL_STACK_SIZE\""
-    CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC /DMP_SMALL_STACK_SIZE\""
-
-  - CFLAGS_VAR: "CFLAGS=\"/Ox /Oi /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD\""
-    CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD\""
+    CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /MD /DLTM_TEST_DYNAMIC\""
 build_script:
 - cmd: >-
     if "Visual Studio 2022"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"
diff --git a/makefile.msvc b/makefile.msvc
index 8540ca33d..8feb425c4 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -11,7 +11,7 @@
 
 #The following can be overridden from command line e.g. make -f makefile.msvc CC=gcc ARFLAGS=rcs
 PREFIX    = c:\devel
-CFLAGS    = /Ox /Oi
+CFLAGS    = /Ox
 LDFLAGS   =
 
 #Compilation flags
diff --git a/tommath_private.h b/tommath_private.h
index 9c25f330f..be620dbc9 100644
--- a/tommath_private.h
+++ b/tommath_private.h
@@ -235,6 +235,23 @@ MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR;
 MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 
 #ifdef MP_SMALL_STACK_SIZE
+
+#if defined(__GNUC__)
+/* We use TLS (Thread Local Storage) to manage the instance of the WARRAY
+ * per thread.
+ * The compilers we usually look at are GCC, Clang and MSVC.
+ * Both GCC and Clang are straightforward with TLS, so it is enabled there.
+ * With MSVC the tests passed with the static library, but failed when
+ * the library was built as a DLL. As a result MSVC, like every other
+ * compiler that does not define `__GNUC__`, is rejected with an #error.
+ * If your compiler can handle TLS properly without too much hocus pocus,
+ * feel free to open a PR to add support for it.
+ */
+#define mp_thread __thread
+#else
+#error "MP_SMALL_STACK_SIZE not supported with your compiler"
+#endif
+
 #define MP_SMALL_STACK_SIZE_C
 #define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get()
 #define MP_FREE_WARRAY(name) s_mp_warray_put(name)
@@ -245,10 +262,8 @@ MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR;
 #define MP_CHECK_WARRAY(name)
 #endif
 
-#if defined(_MSC_VER)
-#define mp_thread __declspec(thread)
-#elif defined(__GNUC__)
-#define mp_thread __thread
+#ifndef mp_thread
+#define mp_thread
 #endif
 
 typedef struct {