From bd66fccee4bddfb9334fd419444665abeaea6034 Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Tue, 12 Apr 2022 15:03:47 +0200 Subject: [PATCH 01/11] add `MP_SMALL_STACK_SIZE` option This adds an option to use a heap-buffer for the usually stack-based `MP_WARRAY`-sized temporary buffers. By default it will reserve a single buffer, which can be modified * at compile-time via the `MP_WARRAY_NUM` define * at run-time by calling `mp_warray_init()` The internal structure can only be created once. If one wants to modify the maximum number of elements, the entire structure has to be freed by calling `mp_warray_free()`. In case one wants to use this option with multiple threads, one shall use the `mp_warray_init()` function and pass appropriate locking functions. Signed-off-by: Steffen Jaeckel --- .github/workflows/main.yml | 6 ++++ CMakeLists.txt | 2 +- demo/test.c | 31 ++++++++++++++-- doc/bn.tex | 64 +++++++++++++++++++++++++++++++++- helper.pl | 2 +- mp_warray_free.c | 36 +++++++++++++++++++ mp_warray_init.c | 55 +++++++++++++++++++++++++++++ s_mp_montgomery_reduce_comba.c | 7 +++- s_mp_mul_comba.c | 7 +++- s_mp_mul_high_comba.c | 7 +++- s_mp_sqr_comba.c | 6 +++- s_mp_warray.c | 8 +++++ s_mp_warray_free.c | 17 +++++++++ s_mp_warray_get.c | 33 ++++++++++++++++++ s_mp_warray_put.c | 20 +++++++++++ tommath.def | 2 ++ tommath.h | 28 +++++++++++++++ tommath_class.h | 36 +++++++++++++++++++ tommath_private.h | 36 +++++++++++++++++++ tommath_superclass.h | 2 ++ 20 files changed, 396 insertions(+), 9 deletions(-) create mode 100644 mp_warray_free.c create mode 100644 mp_warray_init.c create mode 100644 s_mp_warray.c create mode 100644 s_mp_warray_free.c create mode 100644 s_mp_warray_get.c create mode 100644 s_mp_warray_put.c diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 24f881f3d..6fcd5d466 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -70,6 +70,12 @@ jobs: # RSA superclass with tests (no sanitizer, but debug 
info) - { BUILDOPTIONS: '--with-cc=gcc --with-m64 --cflags=-DLTM_NOTHING --cflags=-DSC_RSA_1_WITH_TESTS --limit-valgrind', SANITIZER: '', COMPILE_DEBUG: '1', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: '' } + # Build with small stack-size + - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' } + - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_NO_LOCKING', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' } + - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } + - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_TEST_LOCKING', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } + # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs. #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune' #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_32BIT --limit-valgrind --make-option=tune' diff --git a/CMakeLists.txt b/CMakeLists.txt index d60632777..2f59d32e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,7 +138,7 @@ if(COMPILE_LTO) if(COMPILER_SUPPORTS_LTO) set_property(TARGET ${PROJECT_NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) else() - message(SEND_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.") + message(FATAL_ERROR "This compiler does not support LTO. 
Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.") endif() endif() diff --git a/demo/test.c b/demo/test.c index f290dbf21..4b1d30f86 100644 --- a/demo/test.c +++ b/demo/test.c @@ -2451,6 +2451,21 @@ static int test_mp_pack_unpack(void) return EXIT_FAILURE; } + +#ifdef MP_TEST_LOCKING +#ifdef MP_NO_LOCKING +#error "Can't test locking when locking is disabled" +#endif +static mp_lock lock_ctx; +static int noop_lock_unlock(void *ctx) +{ + EXPECT(ctx == &lock_ctx); + return 0; +LBL_ERR: + return -1; +} +#endif + #ifndef LTM_TEST_DYNAMIC #define ONLY_PUBLIC_API_C #endif @@ -2525,7 +2540,14 @@ static int unit_tests(int argc, char **argv) unsigned long i, ok, fail, nop; uint64_t t; int j; +#ifdef MP_TEST_LOCKING + lock_ctx.lock = noop_lock_unlock; + lock_ctx.unlock = noop_lock_unlock; + lock_ctx.ctx = &lock_ctx; + if (mp_warray_init(MP_WARRAY_NUM, true, &lock_ctx) != MP_OKAY) + return EXIT_FAILURE; +#endif ok = fail = nop = 0; t = (uint64_t)time(NULL); @@ -2533,6 +2555,7 @@ static int unit_tests(int argc, char **argv) s_mp_rand_jenkins_init(t); mp_rand_source(s_mp_rand_jenkins); + for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) { if (argc > 1) { for (j = 1; j < argc; ++j) { @@ -2556,8 +2579,12 @@ static int unit_tests(int argc, char **argv) } fprintf(fail?stderr:stdout, "Tests OK/NOP/FAIL: %lu/%lu/%lu\n", ok, nop, fail); - if (fail != 0) return EXIT_FAILURE; - else return EXIT_SUCCESS; + EXPECT(mp_warray_free() != -2); + + if (fail == 0) + return EXIT_SUCCESS; +LBL_ERR: + return EXIT_FAILURE; } int main(int argc, char **argv) diff --git a/doc/bn.tex b/doc/bn.tex index 22ae5f3eb..185876335 100644 --- a/doc/bn.tex +++ b/doc/bn.tex @@ -352,6 +352,20 @@ \subsubsection{Operand Size Related} \end{center} \end{small} +\subsection{Small-Stack option} +\label{ch:SMALL_STACK_INTRO} +The library can be compiled with the symbol \texttt{MP\_SMALL\_STACK\_SIZE} defined, which results in +the temporary \texttt{MP\_WARRAY}-sized stack buffers being put on the heap. 
+This comes with one problem, namely: formerly promised thread-safety isn't given anymore. +Therefore if the Small-Stack option is enabled while doing multi threading, the provided locking +mechanism shall be used. +For some use cases it can be desired to use the Small-Stack option, but there are no threads and +therefore we provide the possibility to disable locking by defining the symbol \texttt{MP\_NO\_LOCKING}. + +In case one already knows how many threads must be supported, the symbol \texttt{MP\_WARRAY\_NUM} can +be useful. It can be pre-defined at compile time to the number of heap buffers created on automatic +initialisation. C.f. \ref{ch:SMALL_STACK_API} for the dynamic API and further details. + \section{Purpose of LibTomMath} Unlike GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath was not written with bleeding edge performance in mind. First and foremost LibTomMath was written @@ -428,7 +442,9 @@ \chapter{Getting Started with LibTomMath} \section{Building Programs} In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library file (typically -libtommath.a). There is no library initialization required and the entire library is thread safe. +libtommath.a). There is no library initialization required and the entire library is thread safe +if it is used in its default configuration. Locking is recommended if the small-stack option +is enabled and multiple threads are used, c.f. \ref{ch:SMALL_STACK_INTRO} resp. \ref{ch:SMALL_STACK_API} \section{Return Codes} There are five possible return codes a function may return. @@ -813,6 +829,52 @@ \subsection{Adding additional digits} \end{alltt} \end{small} +\section{Small-Stack option} +\label{ch:SMALL_STACK_API} + +In case the \texttt{MP\_SMALL\_STACK\_SIZE} symbol is defined the following functions +can be useful. + +To initialize the internal structure the following function shall be called. 
+ +\index{mp\_warray\_init} +\begin{alltt} +mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock); +\end{alltt} + +The flag \texttt{preallocate} controls whether the internal buffers -- +\texttt{n\_alloc} buffers of size \texttt{MP\_WARRAY} -- will be allocated when +\texttt{mp\_warray\_init()} is called, or whether they will be allocated when required. +The \texttt{mp\_lock} struct looks as follows and shall be used to protect the +internal structure when using the library in a multi-threaded application. + +\index{mp\_lock} +\begin{alltt} +typedef struct { + int (*lock)(void *ctx); + int (*unlock)(void *ctx); + void *ctx; +} mp_lock; +\end{alltt} + +The \texttt{mp\_lock.lock} resp. \texttt{mp\_lock.unlock} functions will be called before resp. +after modifying the internal struct. +The \texttt{mp\_lock.ctx} element will be passed to those functions. + +To free the internally allocated memory the following function shall be called. + +\index{mp\_warray\_free} +\begin{alltt} +int mp_warray_free(void); +\end{alltt} + + +Those two API functions are always available, even if the \texttt{MP\_SMALL\_STACK\_SIZE} option +has been disabled at compile time. +In that case \texttt{mp\_warray\_init()} will return \texttt{MP\_ERR} and \texttt{mp\_warray\_free()} +will return $-1$. 
+ + \chapter{Basic Operations} \section{Copying} diff --git a/helper.pl b/helper.pl index 53658614c..ffc592a7c 100755 --- a/helper.pl +++ b/helper.pl @@ -394,7 +394,7 @@ sub update_dep foreach my $filename (glob '*mp_*.c') { my $content; my $cc = $ENV{'CC'} || 'gcc'; - $content = `$cc -E -x c -DLTM_ALL $filename`; + $content = `$cc -E -x c -DLTM_ALL -DMP_SMALL_STACK_SIZE $filename`; $content =~ s/^# 1 "$filename".*?^# 2 "$filename"//ms; # convert filename to upper case so we can use it as a define diff --git a/mp_warray_free.c b/mp_warray_free.c new file mode 100644 index 000000000..4b01282a0 --- /dev/null +++ b/mp_warray_free.c @@ -0,0 +1,36 @@ +#include "tommath_private.h" +#ifdef MP_WARRAY_FREE_C +/* LibTomMath, multiple-precision integer library -- Tom St Denis */ +/* SPDX-License-Identifier: Unlicense */ + +/* static check that the multiplication won't overflow */ +MP_STATIC_ASSERT(warray_free_sz_does_not_overflow, (sizeof(mp_word) * MP_WARRAY) >= MP_WARRAY) + +static int s_warray_free(void) +{ + int ret = 0; + size_t n; + S_MP_WARRAY_LOCK(); + for (n = 0; n < s_mp_warray.allocated; ++n) { + if (s_mp_warray.l_used[n].warray) { + ret = -2; + goto ERR_OUT; + } + } + for (n = 0; n < s_mp_warray.allocated; ++n) { + MP_FREE(s_mp_warray.l_free[n].warray, sizeof(mp_word) * MP_WARRAY); + s_mp_warray.l_free[n].warray = NULL; + } + s_mp_warray_free(s_mp_warray.usable); +ERR_OUT: + S_MP_WARRAY_UNLOCK(); + return ret; +} + +int mp_warray_free(void) +{ + if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_free(); + return -1; +} + +#endif diff --git a/mp_warray_init.c b/mp_warray_init.c new file mode 100644 index 000000000..0ff93aa53 --- /dev/null +++ b/mp_warray_init.c @@ -0,0 +1,55 @@ +#include "tommath_private.h" +#ifdef MP_WARRAY_INIT_C +/* LibTomMath, multiple-precision integer library -- Tom St Denis */ +/* SPDX-License-Identifier: Unlicense */ + +static mp_err s_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock) +{ + size_t n; + if (s_mp_warray.l_free != NULL 
|| s_mp_warray.l_used != NULL) { + return MP_VAL; + } + + if (MP_HAS(MP_USE_LOCKING) && (lock != NULL)) { + if (lock->lock == NULL || lock->unlock == NULL) + return MP_VAL; + s_mp_warray.lock = *lock; + s_mp_warray.locking_enabled = true; + } else { + s_mp_zero_buf(&s_mp_warray.lock, sizeof(s_mp_warray.lock)); + } + + s_mp_warray.l_free = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_free))); + s_mp_warray.l_used = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_used))); + if (s_mp_warray.l_free == NULL || s_mp_warray.l_used == NULL) { + s_mp_warray_free(n_alloc); + return MP_MEM; + } + + if (preallocate) { + for (n = 0; n < n_alloc; ++n) { + s_mp_warray.l_free[n].warray = MP_CALLOC(MP_WARRAY, sizeof(mp_word)); + if (s_mp_warray.l_free[n].warray == NULL) { + while (n > 0) { + n--; + MP_FREE(s_mp_warray.l_free[n].warray, MP_WARRAY * sizeof(mp_word)); + s_mp_warray.l_free[n].warray = NULL; + } + s_mp_warray_free(n_alloc); + return MP_MEM; + } + } + s_mp_warray.allocated = n_alloc; + } + + s_mp_warray.usable = n_alloc; + return MP_OKAY; +} + +mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock) +{ + if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate, lock); + return MP_ERR; +} + +#endif diff --git a/s_mp_montgomery_reduce_comba.c b/s_mp_montgomery_reduce_comba.c index 7472caf34..3858f75a0 100644 --- a/s_mp_montgomery_reduce_comba.c +++ b/s_mp_montgomery_reduce_comba.c @@ -15,9 +15,12 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho) { int ix, oldused; mp_err err; - mp_word W[MP_WARRAY]; + mp_word MP_ALLOC_WARRAY(W); + + MP_CHECK_WARRAY(W); if (x->used > MP_WARRAY) { + MP_FREE_WARRAY(W); return MP_VAL; } @@ -26,6 +29,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho) /* grow a as required */ if ((err = mp_grow(x, n->used + 1)) != MP_OKAY) { + MP_FREE_WARRAY(W); return err; } @@ -110,6 +114,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho) 
mp_clamp(x); + MP_FREE_WARRAY(W); /* if A >= m then A = A - m */ if (mp_cmp_mag(x, n) != MP_LT) { return s_mp_sub(x, n, x); diff --git a/s_mp_mul_comba.c b/s_mp_mul_comba.c index ca89ff9dd..5b37035ea 100644 --- a/s_mp_mul_comba.c +++ b/s_mp_mul_comba.c @@ -23,15 +23,19 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) { int oldused, pa, ix; mp_err err; - mp_digit W[MP_WARRAY]; + mp_digit MP_ALLOC_WARRAY(W); mp_word _W; + MP_CHECK_WARRAY(W); + if (digs < 0) { + MP_FREE_WARRAY(W); return MP_VAL; } /* grow the destination as required */ if ((err = mp_grow(c, digs)) != MP_OKAY) { + MP_FREE_WARRAY(W); return err; } @@ -77,6 +81,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs) s_mp_zero_digs(c->dp + c->used, oldused - c->used); mp_clamp(c); + MP_FREE_WARRAY(W); return MP_OKAY; } #endif diff --git a/s_mp_mul_high_comba.c b/s_mp_mul_high_comba.c index b5ac06d74..b0096d4e6 100644 --- a/s_mp_mul_high_comba.c +++ b/s_mp_mul_high_comba.c @@ -16,16 +16,20 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs { int oldused, pa, ix; mp_err err; - mp_digit W[MP_WARRAY]; + mp_digit MP_ALLOC_WARRAY(W); mp_word _W; + MP_CHECK_WARRAY(W); + if (digs < 0) { + MP_FREE_WARRAY(W); return MP_VAL; } /* grow the destination as required */ pa = a->used + b->used; if ((err = mp_grow(c, pa)) != MP_OKAY) { + MP_FREE_WARRAY(W); return err; } @@ -69,6 +73,7 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs s_mp_zero_digs(c->dp + c->used, oldused - c->used); mp_clamp(c); + MP_FREE_WARRAY(W); return MP_OKAY; } #endif diff --git a/s_mp_sqr_comba.c b/s_mp_sqr_comba.c index 1bcc1f93f..336a0a082 100644 --- a/s_mp_sqr_comba.c +++ b/s_mp_sqr_comba.c @@ -16,13 +16,16 @@ After that loop you do the squares and add them in. 
mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b) { int oldused, pa, ix; - mp_digit W[MP_WARRAY]; + mp_digit MP_ALLOC_WARRAY(W); mp_word W1; mp_err err; + MP_CHECK_WARRAY(W); + /* grow the destination as required */ pa = a->used + a->used; if ((err = mp_grow(b, pa)) != MP_OKAY) { + MP_FREE_WARRAY(W); return err; } @@ -82,6 +85,7 @@ mp_err s_mp_sqr_comba(const mp_int *a, mp_int *b) s_mp_zero_digs(b->dp + b->used, oldused - b->used); mp_clamp(b); + MP_FREE_WARRAY(W); return MP_OKAY; } #endif diff --git a/s_mp_warray.c b/s_mp_warray.c new file mode 100644 index 000000000..d181057cb --- /dev/null +++ b/s_mp_warray.c @@ -0,0 +1,8 @@ +#include "tommath_private.h" +#ifdef S_MP_WARRAY_C +/* LibTomMath, multiple-precision integer library -- Tom St Denis */ +/* SPDX-License-Identifier: Unlicense */ + +st_warray s_mp_warray; + +#endif diff --git a/s_mp_warray_free.c b/s_mp_warray_free.c new file mode 100644 index 000000000..9d8b75eb1 --- /dev/null +++ b/s_mp_warray_free.c @@ -0,0 +1,17 @@ +#include "tommath_private.h" +#ifdef S_MP_WARRAY_FREE_C +/* LibTomMath, multiple-precision integer library -- Tom St Denis */ +/* SPDX-License-Identifier: Unlicense */ + +void s_mp_warray_free(size_t n) +{ + (void)n; + MP_FREE(s_mp_warray.l_free, n * sizeof(*(s_mp_warray.l_free))); + MP_FREE(s_mp_warray.l_used, n * sizeof(*(s_mp_warray.l_used))); + s_mp_warray.l_free = NULL; + s_mp_warray.l_used = NULL; + s_mp_warray.allocated = 0; + s_mp_warray.usable = 0; +} + +#endif diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c new file mode 100644 index 000000000..69b2b72dd --- /dev/null +++ b/s_mp_warray_get.c @@ -0,0 +1,33 @@ +#include "tommath_private.h" +#ifdef S_MP_WARRAY_GET_C +/* LibTomMath, multiple-precision integer library -- Tom St Denis */ +/* SPDX-License-Identifier: Unlicense */ + +void *s_mp_warray_get(void) +{ + void *ret = NULL; + size_t n; + S_MP_WARRAY_LOCK(); + if (s_mp_warray.usable == 0) { + if (mp_warray_init(MP_WARRAY_NUM, false, NULL) != MP_OKAY) + return NULL; + } + for 
(n = 0; n < s_mp_warray.allocated; ++n) { + if (s_mp_warray.l_free[n].warray) { + s_mp_warray.l_used[n] = s_mp_warray.l_free[n]; + s_mp_warray.l_free[n].warray = NULL; + ret = s_mp_warray.l_used[n].warray; + goto LBL_OUT; + } + } + if (s_mp_warray.allocated + 1 > s_mp_warray.usable) + goto LBL_OUT; + ret = MP_CALLOC(MP_WARRAY, sizeof(mp_word)); + s_mp_warray.l_used[s_mp_warray.allocated++].warray = ret; + +LBL_OUT: + S_MP_WARRAY_UNLOCK(); + return ret; +} + +#endif diff --git a/s_mp_warray_put.c b/s_mp_warray_put.c new file mode 100644 index 000000000..5d84bea86 --- /dev/null +++ b/s_mp_warray_put.c @@ -0,0 +1,20 @@ +#include "tommath_private.h" +#ifdef S_MP_WARRAY_PUT_C +/* LibTomMath, multiple-precision integer library -- Tom St Denis */ +/* SPDX-License-Identifier: Unlicense */ + +void s_mp_warray_put(void *w) +{ + size_t n; + S_MP_WARRAY_LOCK(); + for (n = 0; n < s_mp_warray.allocated; ++n) { + if (s_mp_warray.l_used[n].warray == w) { + s_mp_warray.l_free[n] = s_mp_warray.l_used[n]; + s_mp_warray.l_used[n].warray = NULL; + break; + } + } + S_MP_WARRAY_UNLOCK(); +} + +#endif diff --git a/tommath.def b/tommath.def index 86f348727..7aa5860f7 100644 --- a/tommath.def +++ b/tommath.def @@ -125,6 +125,8 @@ EXPORTS mp_to_ubin mp_ubin_size mp_unpack + mp_warray_free + mp_warray_init mp_xor mp_zero MP_MUL_KARATSUBA_CUTOFF diff --git a/tommath.h b/tommath.h index 84bb0909d..d36753dec 100644 --- a/tommath.h +++ b/tommath.h @@ -78,6 +78,25 @@ typedef uint32_t mp_digit; #define MP_MASK ((((mp_digit)1)<<((mp_digit)MP_DIGIT_BIT))-((mp_digit)1)) #define MP_DIGIT_MAX MP_MASK +/* In case the stack size has to be limited, use a WARRAY from the heap */ +#ifdef MP_SMALL_STACK_SIZE +/* Per default we enable the locking mechanism. + * Please disable by defining `MP_NO_LOCKING` if you really know what you do. 
+ */ +#ifndef MP_NO_LOCKING +#define MP_USE_LOCKING +#endif +#endif /* MP_SMALL_STACK_SIZE */ + +/* The user can define how many WARRAY instances are allocated, + * usually this should equal the number of parallel threads that + * use LTM functionality. + * This has no effect if `MP_SMALL_STACK_SIZE` is not defined. + */ +#ifndef MP_WARRAY_NUM +#define MP_WARRAY_NUM 1 +#endif + /* Primality generation flags */ #define MP_PRIME_BBS 0x0001 /* BBS style prime */ #define MP_PRIME_SAFE 0x0002 /* Safe prime (p-1)/2 == prime */ @@ -588,6 +607,15 @@ mp_err mp_fread(mp_int *a, int radix, FILE *stream) MP_WUR; mp_err mp_fwrite(const mp_int *a, int radix, FILE *stream) MP_WUR; #endif +typedef struct { + int (*lock)(void *ctx); + int (*unlock)(void *ctx); + void *ctx; +} mp_lock; + +mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock); +int mp_warray_free(void); + #define mp_to_binary(M, S, N) mp_to_radix((M), (S), (N), NULL, 2) #define mp_to_octal(M, S, N) mp_to_radix((M), (S), (N), NULL, 8) #define mp_to_decimal(M, S, N) mp_to_radix((M), (S), (N), NULL, 10) diff --git a/tommath_class.h b/tommath_class.h index e08bc5f3c..e1a254fed 100644 --- a/tommath_class.h +++ b/tommath_class.h @@ -131,6 +131,8 @@ # define MP_TO_UBIN_C # define MP_UBIN_SIZE_C # define MP_UNPACK_C +# define MP_WARRAY_FREE_C +# define MP_WARRAY_INIT_C # define MP_XOR_C # define MP_ZERO_C # define S_MP_ADD_C @@ -165,6 +167,10 @@ # define S_MP_SQR_KARATSUBA_C # define S_MP_SQR_TOOM_C # define S_MP_SUB_C +# define S_MP_WARRAY_C +# define S_MP_WARRAY_FREE_C +# define S_MP_WARRAY_GET_C +# define S_MP_WARRAY_PUT_C # define S_MP_ZERO_BUF_C # define S_MP_ZERO_DIGS_C #endif @@ -957,6 +963,15 @@ # define MP_ZERO_C #endif +#if defined(MP_WARRAY_FREE_C) +# define S_MP_WARRAY_FREE_C +#endif + +#if defined(MP_WARRAY_INIT_C) +# define S_MP_WARRAY_FREE_C +# define S_MP_ZERO_BUF_C +#endif + #if defined(MP_XOR_C) # define MP_CLAMP_C # define MP_GROW_C @@ -1137,6 +1152,8 @@ # define MP_CMP_MAG_C # define 
MP_GROW_C # define S_MP_SUB_C +# define S_MP_WARRAY_GET_C +# define S_MP_WARRAY_PUT_C # define S_MP_ZERO_BUF_C # define S_MP_ZERO_DIGS_C #endif @@ -1165,6 +1182,8 @@ #if defined(S_MP_MUL_COMBA_C) # define MP_CLAMP_C # define MP_GROW_C +# define S_MP_WARRAY_GET_C +# define S_MP_WARRAY_PUT_C # define S_MP_ZERO_DIGS_C #endif @@ -1179,6 +1198,8 @@ #if defined(S_MP_MUL_HIGH_COMBA_C) # define MP_CLAMP_C # define MP_GROW_C +# define S_MP_WARRAY_GET_C +# define S_MP_WARRAY_PUT_C # define S_MP_ZERO_DIGS_C #endif @@ -1244,6 +1265,8 @@ #if defined(S_MP_SQR_COMBA_C) # define MP_CLAMP_C # define MP_GROW_C +# define S_MP_WARRAY_GET_C +# define S_MP_WARRAY_PUT_C # define S_MP_ZERO_DIGS_C #endif @@ -1279,6 +1302,19 @@ # define S_MP_ZERO_DIGS_C #endif +#if defined(S_MP_WARRAY_C) +#endif + +#if defined(S_MP_WARRAY_FREE_C) +#endif + +#if defined(S_MP_WARRAY_GET_C) +# define MP_WARRAY_INIT_C +#endif + +#if defined(S_MP_WARRAY_PUT_C) +#endif + #if defined(S_MP_ZERO_BUF_C) #endif diff --git a/tommath_private.h b/tommath_private.h index c1fa95a04..6ccf8f0dd 100644 --- a/tommath_private.h +++ b/tommath_private.h @@ -234,6 +234,42 @@ MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix, MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR; MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR; +#ifdef MP_SMALL_STACK_SIZE +#define MP_SMALL_STACK_SIZE_C +#define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get() +#define MP_FREE_WARRAY(name) s_mp_warray_put(name) +#define MP_CHECK_WARRAY(name) do { if ((name) == NULL) { return MP_MEM; } } while(0) +#else +#define MP_ALLOC_WARRAY(name) name[MP_WARRAY] +#define MP_FREE_WARRAY(name) +#define MP_CHECK_WARRAY(name) +#endif + +#ifdef MP_USE_LOCKING +#define MP_USE_LOCKING_C +#define S_MP_WARRAY_LOCK() do { if (s_mp_warray.locking_enabled) { s_mp_warray.lock.lock(s_mp_warray.lock.ctx); } } while(0) +#define S_MP_WARRAY_UNLOCK() do { if (s_mp_warray.locking_enabled) { 
s_mp_warray.lock.unlock(s_mp_warray.lock.ctx); } } while(0) +#else +#define S_MP_WARRAY_LOCK() +#define S_MP_WARRAY_UNLOCK() +#endif + +struct warray { + void *warray; +}; +typedef struct { + struct warray *l_free, *l_used; + size_t allocated, usable; + bool locking_enabled; + mp_lock lock; +} st_warray; + +extern MP_PRIVATE st_warray s_mp_warray; + +MP_PRIVATE void *s_mp_warray_get(void); +MP_PRIVATE void s_mp_warray_put(void *w); +MP_PRIVATE void s_mp_warray_free(size_t n); + #define MP_RADIX_MAP_REVERSE_SIZE 80u extern MP_PRIVATE const char s_mp_radix_map[]; extern MP_PRIVATE const uint8_t s_mp_radix_map_reverse[]; diff --git a/tommath_superclass.h b/tommath_superclass.h index 9245e0020..10c7f12a2 100644 --- a/tommath_superclass.h +++ b/tommath_superclass.h @@ -42,6 +42,8 @@ # define MP_SBIN_SIZE_C # define MP_TO_RADIX_C # define MP_TO_SBIN_C +# define MP_WARRAY_FREE_C +# define MP_WARRAY_INIT_C # define S_MP_RAND_JENKINS_C # define S_MP_RAND_PLATFORM_C #endif From 18e67e1888a801d550b3a9e3e21943fca0d45875 Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Thu, 14 Mar 2024 11:42:34 +0100 Subject: [PATCH 02/11] Replace locking by atomic operations `s_warray_init()` and `s_warray_free()` are not thread-safe and MUST NOT be called from multiple threads. This also removes `MP_WARRAY_NUM`, since automatic initialization will not be safe for more than one thread. 
Signed-off-by: Steffen Jaeckel --- .github/workflows/main.yml | 2 -- demo/test.c | 23 ----------------------- doc/bn.tex | 33 ++++++++------------------------- mp_warray_free.c | 2 -- mp_warray_init.c | 15 +++------------ s_mp_warray_get.c | 17 +++++++++-------- s_mp_warray_put.c | 8 +++----- tommath.h | 27 +-------------------------- tommath_class.h | 1 - tommath_private.h | 15 ++++----------- 10 files changed, 28 insertions(+), 115 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6fcd5d466..664cfcaf7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,9 +72,7 @@ jobs: # Build with small stack-size - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' } - - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_NO_LOCKING', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' } - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } - - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_TEST_LOCKING', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs. 
#- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune' diff --git a/demo/test.c b/demo/test.c index 4b1d30f86..c4eb1f8a8 100644 --- a/demo/test.c +++ b/demo/test.c @@ -2451,21 +2451,6 @@ static int test_mp_pack_unpack(void) return EXIT_FAILURE; } - -#ifdef MP_TEST_LOCKING -#ifdef MP_NO_LOCKING -#error "Can't test locking when locking is disabled" -#endif -static mp_lock lock_ctx; -static int noop_lock_unlock(void *ctx) -{ - EXPECT(ctx == &lock_ctx); - return 0; -LBL_ERR: - return -1; -} -#endif - #ifndef LTM_TEST_DYNAMIC #define ONLY_PUBLIC_API_C #endif @@ -2540,14 +2525,6 @@ static int unit_tests(int argc, char **argv) unsigned long i, ok, fail, nop; uint64_t t; int j; -#ifdef MP_TEST_LOCKING - lock_ctx.lock = noop_lock_unlock; - lock_ctx.unlock = noop_lock_unlock; - lock_ctx.ctx = &lock_ctx; - - if (mp_warray_init(MP_WARRAY_NUM, true, &lock_ctx) != MP_OKAY) - return EXIT_FAILURE; -#endif ok = fail = nop = 0; t = (uint64_t)time(NULL); diff --git a/doc/bn.tex b/doc/bn.tex index 185876335..63e71633b 100644 --- a/doc/bn.tex +++ b/doc/bn.tex @@ -357,14 +357,10 @@ \subsection{Small-Stack option} The library can be compiled with the symbol \texttt{MP\_SMALL\_STACK\_SIZE} defined, which results in the temporary \texttt{MP\_WARRAY}-sized stack buffers being put on the heap. This comes with one problem, namely: formerly promised thread-safety isn't given anymore. -Therefore if the Small-Stack option is enabled while doing multi threading, the provided locking -mechanism shall be used. -For some use cases it can be desired to use the Small-Stack option, but there are no threads and -therefore we provide the possibility to disable locking by defining the symbol \texttt{MP\_NO\_LOCKING}. +Therefore if the Small-Stack option is enabled while doing multi threading, one shall always initialize +the library by calling \texttt{mp\_warray\_init()} once with the correct number of threads. 
-In case one already knows how many threads must be supported, the symbol \texttt{MP\_WARRAY\_NUM} can -be useful. It can be pre-defined at compile time to the number of heap buffers created on automatic -initialisation. C.f. \ref{ch:SMALL_STACK_API} for the dynamic API and further details. +C.f. \ref{ch:SMALL_STACK_API} for the API description and further details. \section{Purpose of LibTomMath} Unlike GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath @@ -443,8 +439,10 @@ \section{Building Programs} In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library file (typically libtommath.a). There is no library initialization required and the entire library is thread safe -if it is used in its default configuration. Locking is recommended if the small-stack option -is enabled and multiple threads are used, c.f. \ref{ch:SMALL_STACK_INTRO} resp. \ref{ch:SMALL_STACK_API} +if it is used in its default configuration. The small-stack option makes use of atomic operations +to maintain its internal state and therefore does not require locking, but it MUST be initialized +if used from multiple threads. For further information see \ref{ch:SMALL_STACK_INTRO} resp. +\ref{ch:SMALL_STACK_API}. \section{Return Codes} There are five possible return codes a function may return. @@ -839,27 +837,12 @@ \section{Small-Stack option} \index{mp\_warray\_init} \begin{alltt} -mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock); +mp_err mp_warray_init(size_t n_alloc, bool preallocate); \end{alltt} The flag \texttt{preallocate} controls whether the internal buffers -- \texttt{n\_alloc} buffers of size \texttt{MP\_WARRAY} -- will be allocated when \texttt{mp\_warray\_init()} is called, or whether they will be allocated when required. -The \texttt{mp\_lock} struct looks as follows and shall be used to protect the -internal structure when using the library in a multi-threaded application. 
- -\index{mp\_lock} -\begin{alltt} -typedef struct { - int (*lock)(void *ctx); - int (*unlock)(void *ctx); - void *ctx; -} mp_lock; -\end{alltt} - -The \texttt{mp\_lock.lock} resp. \texttt{mp\_lock.unlock} functions will be called before resp. -after modifying the internal struct. -The \texttt{mp\_lock.ctx} element will be passed to those functions. To free the internally allocated memory the following function shall be called. diff --git a/mp_warray_free.c b/mp_warray_free.c index 4b01282a0..088efefc4 100644 --- a/mp_warray_free.c +++ b/mp_warray_free.c @@ -10,7 +10,6 @@ static int s_warray_free(void) { int ret = 0; size_t n; - S_MP_WARRAY_LOCK(); for (n = 0; n < s_mp_warray.allocated; ++n) { if (s_mp_warray.l_used[n].warray) { ret = -2; @@ -23,7 +22,6 @@ static int s_warray_free(void) } s_mp_warray_free(s_mp_warray.usable); ERR_OUT: - S_MP_WARRAY_UNLOCK(); return ret; } diff --git a/mp_warray_init.c b/mp_warray_init.c index 0ff93aa53..c25098861 100644 --- a/mp_warray_init.c +++ b/mp_warray_init.c @@ -3,22 +3,13 @@ /* LibTomMath, multiple-precision integer library -- Tom St Denis */ /* SPDX-License-Identifier: Unlicense */ -static mp_err s_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock) +static mp_err s_warray_init(size_t n_alloc, bool preallocate) { size_t n; if (s_mp_warray.l_free != NULL || s_mp_warray.l_used != NULL) { return MP_VAL; } - if (MP_HAS(MP_USE_LOCKING) && (lock != NULL)) { - if (lock->lock == NULL || lock->unlock == NULL) - return MP_VAL; - s_mp_warray.lock = *lock; - s_mp_warray.locking_enabled = true; - } else { - s_mp_zero_buf(&s_mp_warray.lock, sizeof(s_mp_warray.lock)); - } - s_mp_warray.l_free = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_free))); s_mp_warray.l_used = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_used))); if (s_mp_warray.l_free == NULL || s_mp_warray.l_used == NULL) { @@ -46,9 +37,9 @@ static mp_err s_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock) return MP_OKAY; } -mp_err mp_warray_init(size_t n_alloc, 
bool preallocate, mp_lock *lock) +mp_err mp_warray_init(size_t n_alloc, bool preallocate) { - if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate, lock); + if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate); return MP_ERR; } diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c index 69b2b72dd..013e83322 100644 --- a/s_mp_warray_get.c +++ b/s_mp_warray_get.c @@ -7,26 +7,27 @@ void *s_mp_warray_get(void) { void *ret = NULL; size_t n; - S_MP_WARRAY_LOCK(); if (s_mp_warray.usable == 0) { - if (mp_warray_init(MP_WARRAY_NUM, false, NULL) != MP_OKAY) + if (mp_warray_init(1, false) != MP_OKAY) return NULL; } for (n = 0; n < s_mp_warray.allocated; ++n) { - if (s_mp_warray.l_free[n].warray) { - s_mp_warray.l_used[n] = s_mp_warray.l_free[n]; - s_mp_warray.l_free[n].warray = NULL; - ret = s_mp_warray.l_used[n].warray; + if (s_mp_warray.l_free[n].warray == NULL) + continue; + ret = s_mp_warray.l_free[n].warray; + if (MP_CMPEXCH(&s_mp_warray.l_free[n].warray, &ret, NULL)) { + s_mp_warray.l_used[n].warray = ret; goto LBL_OUT; } } + ret = NULL; if (s_mp_warray.allocated + 1 > s_mp_warray.usable) goto LBL_OUT; ret = MP_CALLOC(MP_WARRAY, sizeof(mp_word)); - s_mp_warray.l_used[s_mp_warray.allocated++].warray = ret; + if (ret != NULL) + s_mp_warray.l_used[s_mp_warray.allocated++].warray = ret; LBL_OUT: - S_MP_WARRAY_UNLOCK(); return ret; } diff --git a/s_mp_warray_put.c b/s_mp_warray_put.c index 5d84bea86..4cf413d62 100644 --- a/s_mp_warray_put.c +++ b/s_mp_warray_put.c @@ -5,16 +5,14 @@ void s_mp_warray_put(void *w) { - size_t n; - S_MP_WARRAY_LOCK(); - for (n = 0; n < s_mp_warray.allocated; ++n) { + size_t n, allocated = s_mp_warray.allocated; + for (n = 0; n < allocated; ++n) { if (s_mp_warray.l_used[n].warray == w) { - s_mp_warray.l_free[n] = s_mp_warray.l_used[n]; s_mp_warray.l_used[n].warray = NULL; + s_mp_warray.l_free[n].warray = w; break; } } - S_MP_WARRAY_UNLOCK(); } #endif diff --git a/tommath.h b/tommath.h index 
d36753dec..7da36d0e9 100644 --- a/tommath.h +++ b/tommath.h @@ -78,25 +78,6 @@ typedef uint32_t mp_digit; #define MP_MASK ((((mp_digit)1)<<((mp_digit)MP_DIGIT_BIT))-((mp_digit)1)) #define MP_DIGIT_MAX MP_MASK -/* In case the stack size has to be limited, use a WARRAY from the heap */ -#ifdef MP_SMALL_STACK_SIZE -/* Per default we enable the locking mechanism. - * Please disable by defining `MP_NO_LOCKING` if you really know what you do. - */ -#ifndef MP_NO_LOCKING -#define MP_USE_LOCKING -#endif -#endif /* MP_SMALL_STACK_SIZE */ - -/* The user can define how many WARRAY instances are allocated, - * usually this should equal the number of parallel threads that - * use LTM functionality. - * This has no effect if `MP_SMALL_STACK_SIZE` is not defined. - */ -#ifndef MP_WARRAY_NUM -#define MP_WARRAY_NUM 1 -#endif - /* Primality generation flags */ #define MP_PRIME_BBS 0x0001 /* BBS style prime */ #define MP_PRIME_SAFE 0x0002 /* Safe prime (p-1)/2 == prime */ @@ -607,13 +588,7 @@ mp_err mp_fread(mp_int *a, int radix, FILE *stream) MP_WUR; mp_err mp_fwrite(const mp_int *a, int radix, FILE *stream) MP_WUR; #endif -typedef struct { - int (*lock)(void *ctx); - int (*unlock)(void *ctx); - void *ctx; -} mp_lock; - -mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock); +mp_err mp_warray_init(size_t n_alloc, bool preallocate); int mp_warray_free(void); #define mp_to_binary(M, S, N) mp_to_radix((M), (S), (N), NULL, 2) diff --git a/tommath_class.h b/tommath_class.h index e1a254fed..8841413cf 100644 --- a/tommath_class.h +++ b/tommath_class.h @@ -969,7 +969,6 @@ #if defined(MP_WARRAY_INIT_C) # define S_MP_WARRAY_FREE_C -# define S_MP_ZERO_BUF_C #endif #if defined(MP_XOR_C) diff --git a/tommath_private.h b/tommath_private.h index 6ccf8f0dd..46c3afe2e 100644 --- a/tommath_private.h +++ b/tommath_private.h @@ -104,6 +104,10 @@ extern void *MP_CALLOC(size_t nmemb, size_t size); extern void MP_FREE(void *mem, size_t size); #endif +#ifndef MP_CMPEXCH +#define 
MP_CMPEXCH(ptr, expected, desired) __atomic_compare_exchange_n(ptr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE) +#endif + /* feature detection macro */ #ifdef _MSC_VER /* Prevent false positive: not enough arguments for function-like macro invocation */ @@ -245,23 +249,12 @@ MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR; #define MP_CHECK_WARRAY(name) #endif -#ifdef MP_USE_LOCKING -#define MP_USE_LOCKING_C -#define S_MP_WARRAY_LOCK() do { if (s_mp_warray.locking_enabled) { s_mp_warray.lock.lock(s_mp_warray.lock.ctx); } } while(0) -#define S_MP_WARRAY_UNLOCK() do { if (s_mp_warray.locking_enabled) { s_mp_warray.lock.unlock(s_mp_warray.lock.ctx); } } while(0) -#else -#define S_MP_WARRAY_LOCK() -#define S_MP_WARRAY_UNLOCK() -#endif - struct warray { void *warray; }; typedef struct { struct warray *l_free, *l_used; size_t allocated, usable; - bool locking_enabled; - mp_lock lock; } st_warray; extern MP_PRIVATE st_warray s_mp_warray; From 795f7ba5f5064e8f9bffc0691c5052e9e7a7b140 Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Thu, 14 Mar 2024 13:16:01 +0100 Subject: [PATCH 03/11] Add `s_mp_cmpexch_n()` To be able to support this for MSVC as well, we have to move this into a separate private API function. 
Signed-off-by: Steffen Jaeckel --- s_mp_cmpexch_n.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ s_mp_warray_get.c | 2 +- tommath_class.h | 5 +++++ tommath_private.h | 6 ++---- 4 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 s_mp_cmpexch_n.c diff --git a/s_mp_cmpexch_n.c b/s_mp_cmpexch_n.c new file mode 100644 index 000000000..6334d9d4b --- /dev/null +++ b/s_mp_cmpexch_n.c @@ -0,0 +1,44 @@ +#include "tommath_private.h" +#ifdef S_MP_CMPEXCH_N_C +/* LibTomMath, multiple-precision integer library -- Tom St Denis */ +/* SPDX-License-Identifier: Unlicense */ + +#ifdef __GNUC__ +#define S_CMPEXCH_N_GCC_C +static bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired) +{ + return __atomic_compare_exchange_n(ptr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE); +} +#endif + +#ifdef _MSC_VER +#define S_CMPEXCH_N_MSVC_C + +#ifndef _WIN32_WINNT +#define _WIN32_WINNT 0x0501 +#endif +#ifndef WINVER +#define WINVER 0x0501 +#endif + +#define WIN32_LEAN_AND_MEAN +#include + +static bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired) +{ + InterlockedCompareExchangePointer(ptr, desired, *(expected)); + return *ptr == desired; +} +#endif + +bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired); +bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired); + +bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired) +{ + if (MP_HAS(S_CMPEXCH_N_GCC)) return s_cmpexch_n_gcc(ptr, expected, desired); + if (MP_HAS(S_CMPEXCH_N_MSVC)) return s_cmpexch_n_msvc(ptr, expected, desired); + return false; +} + +#endif diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c index 013e83322..068e8145d 100644 --- a/s_mp_warray_get.c +++ b/s_mp_warray_get.c @@ -15,7 +15,7 @@ void *s_mp_warray_get(void) if (s_mp_warray.l_free[n].warray == NULL) continue; ret = s_mp_warray.l_free[n].warray; - if (MP_CMPEXCH(&s_mp_warray.l_free[n].warray, &ret, NULL)) { + if (s_mp_cmpexch_n(&s_mp_warray.l_free[n].warray, &ret, NULL)) { 
s_mp_warray.l_used[n].warray = ret; goto LBL_OUT; } diff --git a/tommath_class.h b/tommath_class.h index 8841413cf..bb89cc237 100644 --- a/tommath_class.h +++ b/tommath_class.h @@ -136,6 +136,7 @@ # define MP_XOR_C # define MP_ZERO_C # define S_MP_ADD_C +# define S_MP_CMPEXCH_N_C # define S_MP_COPY_DIGS_C # define S_MP_DIV_3_C # define S_MP_DIV_RECURSIVE_C @@ -986,6 +987,9 @@ # define S_MP_ZERO_DIGS_C #endif +#if defined(S_MP_CMPEXCH_N_C) +#endif + #if defined(S_MP_COPY_DIGS_C) #endif @@ -1309,6 +1313,7 @@ #if defined(S_MP_WARRAY_GET_C) # define MP_WARRAY_INIT_C +# define S_MP_CMPEXCH_N_C #endif #if defined(S_MP_WARRAY_PUT_C) diff --git a/tommath_private.h b/tommath_private.h index 46c3afe2e..36291a061 100644 --- a/tommath_private.h +++ b/tommath_private.h @@ -104,10 +104,6 @@ extern void *MP_CALLOC(size_t nmemb, size_t size); extern void MP_FREE(void *mem, size_t size); #endif -#ifndef MP_CMPEXCH -#define MP_CMPEXCH(ptr, expected, desired) __atomic_compare_exchange_n(ptr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE) -#endif - /* feature detection macro */ #ifdef _MSC_VER /* Prevent false positive: not enough arguments for function-like macro invocation */ @@ -238,6 +234,8 @@ MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix, MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR; MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR; +MP_PRIVATE bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired); + #ifdef MP_SMALL_STACK_SIZE #define MP_SMALL_STACK_SIZE_C #define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get() From 98655a8e3c7249ce08a30a0b064c874aa5316c60 Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Thu, 14 Mar 2024 13:49:53 +0100 Subject: [PATCH 04/11] Add multi-threaded tests Output gets garbled a bit, but we only care about the result which is `Tests OK/NOP/FAIL: 50/0/0`. Add `-Wno-incomplete-setjmp-declaration` since `clang-10` shipping with Ubuntu 20.04 seems broken... 
and `-Wno-unknown-warning-option` since `clang-8` doesn't know about this warning... Signed-off-by: Steffen Jaeckel --- .github/workflows/main.yml | 2 ++ demo/test.c | 68 ++++++++++++++++++++++++++++++++------ makefile_include.mk | 2 +- testme.sh | 37 ++++++++++++--------- 4 files changed, 82 insertions(+), 27 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 664cfcaf7..fd8e34cf5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -73,6 +73,8 @@ jobs: # Build with small stack-size - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' } - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } + - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' } + - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs. 
#- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune' diff --git a/demo/test.c b/demo/test.c index c4eb1f8a8..306a9c28a 100644 --- a/demo/test.c +++ b/demo/test.c @@ -2455,12 +2455,39 @@ static int test_mp_pack_unpack(void) #define ONLY_PUBLIC_API_C #endif +#if !defined(LTM_TEST_MULTITHREAD) || !defined(MP_SMALL_STACK_SIZE) +#define SINGLE_THREADED_C +typedef unsigned long int pthread_t; +extern int pthread_create(pthread_t *, const void *, void *(*)(void *), void *); +extern int pthread_join(pthread_t, void **); +#else +#define MULTI_THREADED_C +#include +#endif + +struct test_fn { + const char *name; + int (*fn)(void); +}; + +struct thread_info { + pthread_t thread_id; + const struct test_fn *t; + int ret; +}; + +static void *run(void *arg) +{ + struct thread_info *tinfo = arg; + + tinfo->ret = tinfo->t->fn(); + + return arg; +} + static int unit_tests(int argc, char **argv) { - static const struct { - const char *name; - int (*fn)(void); - } test[] = { + static const struct test_fn test[] = { #define T0(n) { #n, test_##n } #define T1(n, o) { #n, MP_HAS(o) ? test_##n : NULL } #define T2(n, o1, o2) { #n, (MP_HAS(o1) && MP_HAS(o2)) ? 
test_##n : NULL } @@ -2522,9 +2549,10 @@ static int unit_tests(int argc, char **argv) #undef T2 #undef T1 }; + struct thread_info test_threads[sizeof(test)/sizeof(test[0])], *res; unsigned long i, ok, fail, nop; uint64_t t; - int j; + int j = -1; ok = fail = nop = 0; t = (uint64_t)time(NULL); @@ -2532,21 +2560,39 @@ static int unit_tests(int argc, char **argv) s_mp_rand_jenkins_init(t); mp_rand_source(s_mp_rand_jenkins); + if (MP_HAS(MULTI_THREADED)) { + printf("Multi-threading enabled\n\n"); + DO(mp_warray_init(sizeof(test) / sizeof(test[0]), 1)); + /* we ignore the fact that jenkings is not thread safe */ + for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) { + test_threads[i].t = &test[i]; + EXPECT(pthread_create(&test_threads[i].thread_id, NULL, run, &test_threads[i]) == 0); + } + } for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) { - if (argc > 1) { - for (j = 1; j < argc; ++j) { - if (strstr(test[i].name, argv[j]) != NULL) { - break; + if (MP_HAS(SINGLE_THREADED)) { + if (argc > 1) { + for (j = 1; j < argc; ++j) { + if (strstr(test[i].name, argv[j]) != NULL) { + break; + } } + if (j == argc) continue; } - if (j == argc) continue; + + if (test[i].fn) + j = test[i].fn(); + } else if (MP_HAS(MULTI_THREADED)) { + EXPECT(pthread_join(test_threads[i].thread_id, (void **)&res) == 0); + j = res->ret; } printf("TEST %s\n", test[i].name); + if (test[i].fn == NULL) { nop++; printf("NOP %s\n\n", test[i].name); - } else if (test[i].fn() == EXIT_SUCCESS) { + } else if (j == EXIT_SUCCESS) { ok++; printf("\n"); } else { diff --git a/makefile_include.mk b/makefile_include.mk index da897396b..d47ea2ba2 100644 --- a/makefile_include.mk +++ b/makefile_include.mk @@ -97,7 +97,7 @@ endif endif # COMPILE_SIZE ifneq ($(findstring clang,$(CC)),) -LTM_CFLAGS += -Wno-typedef-redefinition -Wno-tautological-compare -Wno-builtin-requires-header +LTM_CFLAGS += -Wno-unknown-warning-option -Wno-typedef-redefinition -Wno-tautological-compare -Wno-builtin-requires-header 
-Wno-incomplete-setjmp-declaration ifdef IGNORE_SPEED #for dead code eliminiation LTM_CFLAGS += -O1 diff --git a/testme.sh b/testme.sh index 089e42a70..92997a041 100755 --- a/testme.sh +++ b/testme.sh @@ -70,6 +70,8 @@ All other options will be tested with all MP_xBIT configurations. runtime and may trigger the 30 minutes timeout. + --multithread Run tests in multi-threaded mode (via pthread). + Godmode: --all Choose all architectures and gcc and clang @@ -128,7 +130,7 @@ _make() echo -ne " Compile $1 $2" suffix=$(echo ${1}${2} | tr ' ' '_') _fixup_cflags "$1" - CC="$1" CFLAGS="$2 $TEST_CFLAGS" make -j$MAKE_JOBS $3 $MAKE_OPTIONS 2>gcc_errors_${suffix}.log + CC="$1" CFLAGS="$2 $TEST_CFLAGS" LFLAGS="$4" LDFLAGS="$5" make -j$MAKE_JOBS $3 $MAKE_OPTIONS 2>gcc_errors_${suffix}.log errcnt=$(wc -l < gcc_errors_${suffix}.log) if [[ ${errcnt} -gt 1 ]]; then echo " failed" @@ -148,10 +150,10 @@ _runtest() # "make tune" will run "tune_it.sh" automatically, hence "autotune", but it cannot # get switched off without some effort, so we just let it run twice for testing purposes echo -e "\rRun autotune $1 $2" - _make "$1" "$2" "" + _make "$1" "$2" "" "$3" "$4" $_timeout $TUNE_CMD > test_${suffix}.log || _die "running autotune" $? else - _make "$1" "$2" "test" + _make "$1" "$2" "test" "$3" "$4" echo -e "\rRun test $1 $2" $_timeout ./test > test_${suffix}.log || _die "running tests" $? fi @@ -171,13 +173,13 @@ echo "MAKE_OPTIONS = \"$MAKE_OPTIONS\"" if [[ "$MAKE_OPTIONS" =~ "tune" ]] then echo "autotune branch" - _make "$1" "$2" "" + _make "$1" "$2" "" "$3" "$4" # The shell used for /bin/sh is DASH 0.5.7-4ubuntu1 on the author's machine which fails valgrind, so # we just run on instance of etc/tune with the same options as in etc/tune_it.sh echo -e "\rRun etc/tune $1 $2 once inside valgrind" $_timeout $VALGRIND_BIN $VALGRIND_OPTS $TUNE_CMD > test_${suffix}.log || _die "running etc/tune" $? 
else - _make "$1" "$2" "test" + _make "$1" "$2" "test" "$3" "$4" echo -e "\rRun test $1 $2 inside valgrind" $_timeout $VALGRIND_BIN $VALGRIND_OPTS ./test > test_${suffix}.log || _die "running tests" $? fi @@ -301,6 +303,11 @@ do --symbols) CHECK_SYMBOLS="1" ;; + --multithread) + CFLAGS="$CFLAGS -DLTM_TEST_MULTITHREAD" + LFLAGS="$LFLAGS -pthread" + LDFLAGS="$LDFLAGS -pthread" + ;; --all) COMPILERS="gcc clang" ARCHFLAGS="-m64 -m32 -mx32" @@ -376,9 +383,9 @@ then _banner "$CC" if [[ "$VALGRIND_BIN" != "" ]] then - _runvalgrind "$CC" "" + _runvalgrind "$CC" "" "$LFLAGS" "$LDFLAGS" else - _runtest "$CC" "" + _runtest "$CC" "" "$LFLAGS" "$LDFLAGS" fi _exit fi @@ -398,9 +405,9 @@ _banner if [[ "$TEST_VS_MTEST" != "" ]] then make clean > /dev/null - _make "${compilers[0]}" "${archflags[0]} $CFLAGS" "mtest_opponent" + _make "${compilers[0]}" "${archflags[0]} $CFLAGS" "mtest_opponent" "$LFLAGS" "$LDFLAGS" echo - _make "gcc" "$MTEST_RAND" "mtest" + _make "gcc" "$MTEST_RAND" "mtest" "$LFLAGS" "$LDFLAGS" echo echo "Run test vs. 
mtest for $TEST_VS_MTEST iterations" _timeout="" @@ -429,15 +436,15 @@ do fi if [[ "$VALGRIND_BIN" != "" ]] then - _runvalgrind "$i" "$a $CFLAGS" + _runvalgrind "$i" "$a $CFLAGS" "$LFLAGS" "$LDFLAGS" [ "$WITH_LOW_MP" != "1" ] && continue - _runvalgrind "$i" "$a -DMP_16BIT $CFLAGS" - _runvalgrind "$i" "$a -DMP_32BIT $CFLAGS" + _runvalgrind "$i" "$a -DMP_16BIT $CFLAGS" "$LFLAGS" "$LDFLAGS" + _runvalgrind "$i" "$a -DMP_32BIT $CFLAGS" "$LFLAGS" "$LDFLAGS" else - _runtest "$i" "$a $CFLAGS" + _runtest "$i" "$a $CFLAGS" "$LFLAGS" "$LDFLAGS" [ "$WITH_LOW_MP" != "1" ] && continue - _runtest "$i" "$a -DMP_16BIT $CFLAGS" - _runtest "$i" "$a -DMP_32BIT $CFLAGS" + _runtest "$i" "$a -DMP_16BIT $CFLAGS" "$LFLAGS" "$LDFLAGS" + _runtest "$i" "$a -DMP_32BIT $CFLAGS" "$LFLAGS" "$LDFLAGS" fi done done From fae9aa56457a8cba2f1ca93270ee167d319f7ddc Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Mon, 25 Mar 2024 11:57:42 +0100 Subject: [PATCH 05/11] Add tests for MSVC multi-threading ... and fix some MSVC related (and other) things. 
Signed-off-by: Steffen Jaeckel --- appveyor.yml | 10 +++++- demo/test.c | 86 +++++++++++++++++++++++++++++++++++++++++------ makefile | 6 ++-- makefile.msvc | 2 +- s_mp_cmpexch_n.c | 3 +- s_mp_warray_get.c | 8 +++-- tommath_c89.h | 5 +++ 7 files changed, 101 insertions(+), 19 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 30d9ee757..5606d9abd 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -18,9 +18,17 @@ build_script: if "Visual Studio 2017"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64 + nmake -f makefile.msvc test.exe CFLAGS="/Ox /Oi /DMP_SMALL_STACK_SIZE" + copy /Y test.exe test_small_stack.exe + nmake -f makefile.msvc clean-obj + nmake -f makefile.msvc test.exe CFLAGS="/Ox /Oi /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD" + copy /Y test.exe test_small_stack_multithreaded.exe + nmake -f makefile.msvc clean-obj nmake -f makefile.msvc test.exe nmake -f makefile.msvc clean-obj - nmake -f makefile.msvc test_dll.exe CFLAGS="/Ox /MD /DLTM_TEST_DYNAMIC" + nmake -f makefile.msvc test_dll.exe CFLAGS="/Ox /Oi /MD /DLTM_TEST_DYNAMIC" test_script: +- cmd: test_small_stack.exe +- cmd: test_small_stack_multithreaded.exe - cmd: test.exe - cmd: test_dll.exe diff --git a/demo/test.c b/demo/test.c index 306a9c28a..5ab9686c3 100644 --- a/demo/test.c +++ b/demo/test.c @@ -2455,14 +2455,40 @@ static int test_mp_pack_unpack(void) #define ONLY_PUBLIC_API_C #endif -#if !defined(LTM_TEST_MULTITHREAD) || !defined(MP_SMALL_STACK_SIZE) +#if !defined(LTM_TEST_MULTITHREAD) #define SINGLE_THREADED_C -typedef unsigned long int pthread_t; -extern int pthread_create(pthread_t *, const void *, void *(*)(void *), void *); -extern 
int pthread_join(pthread_t, void **); +typedef uintptr_t thread_id_t; #else #define MULTI_THREADED_C +#if !defined(_WIN32) +#define MULTI_THREADED_PTHREAD_C #include +typedef pthread_t thread_id_t; +#else +#define MULTI_THREADED_MSVC_C + +#ifndef _WIN32_WINNT +#define _WIN32_WINNT 0x0501 +#endif +#ifndef WINVER +#define WINVER 0x0501 +#endif + +#define WIN32_LEAN_AND_MEAN +#include +typedef HANDLE thread_id_t; +#endif +#endif + +#if !defined(MULTI_THREADED_PTHREAD_C) +extern int pthread_create(thread_id_t *, const void *, void *(*)(void *), void *); +extern int pthread_join(thread_id_t, void **); +#endif + +#if !defined(MULTI_THREADED_MSVC_C) +extern thread_id_t CreateThread(void *, size_t, unsigned long (*)(void *), void *, unsigned long, void *); +extern unsigned long WaitForSingleObject(thread_id_t hHandle, unsigned long dwMilliseconds); +#define INFINITE ((unsigned long)-1) #endif struct test_fn { @@ -2471,12 +2497,12 @@ struct test_fn { }; struct thread_info { - pthread_t thread_id; + thread_id_t thread_id; const struct test_fn *t; int ret; }; -static void *run(void *arg) +static void *run_pthread(void *arg) { struct thread_info *tinfo = arg; @@ -2485,6 +2511,38 @@ static void *run(void *arg) return arg; } +static unsigned long run_msvc(void *arg) +{ + struct thread_info *tinfo = arg; + + tinfo->ret = tinfo->t->fn(); + + return 0; +} + +static int thread_start(struct thread_info *info) +{ + if (MP_HAS(MULTI_THREADED_PTHREAD)) + return pthread_create(&info->thread_id, NULL, run_pthread, info); + if (MP_HAS(MULTI_THREADED_MSVC)) { + info->thread_id = CreateThread(NULL, 0, run_msvc, info, 0, NULL); + return info->thread_id == (thread_id_t)NULL ? 
-1 : 0; + } + return -1; +} + +static int thread_join(struct thread_info *info, struct thread_info **res) +{ + if (MP_HAS(MULTI_THREADED_PTHREAD)) + return pthread_join(info->thread_id, (void **)res); + if (MP_HAS(MULTI_THREADED_MSVC)) { + WaitForSingleObject(info->thread_id, INFINITE); + *res = info; + return 0; + } + return -1; +} + static int unit_tests(int argc, char **argv) { static const struct test_fn test[] = { @@ -2551,8 +2609,9 @@ static int unit_tests(int argc, char **argv) }; struct thread_info test_threads[sizeof(test)/sizeof(test[0])], *res; unsigned long i, ok, fail, nop; + size_t n_threads = MP_HAS(MULTI_THREADED) ? sizeof(test) / sizeof(test[0]) : 1; uint64_t t; - int j = -1; + int j; ok = fail = nop = 0; t = (uint64_t)time(NULL); @@ -2560,17 +2619,22 @@ static int unit_tests(int argc, char **argv) s_mp_rand_jenkins_init(t); mp_rand_source(s_mp_rand_jenkins); + if (MP_HAS(MP_SMALL_STACK_SIZE)) { + printf("Small-stack enabled with %zu warray buffers\n\n", n_threads); + DO(mp_warray_init(n_threads, 1)); + } + if (MP_HAS(MULTI_THREADED)) { printf("Multi-threading enabled\n\n"); - DO(mp_warray_init(sizeof(test) / sizeof(test[0]), 1)); - /* we ignore the fact that jenkings is not thread safe */ + /* we ignore the fact that jenkins is not thread safe */ for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) { test_threads[i].t = &test[i]; - EXPECT(pthread_create(&test_threads[i].thread_id, NULL, run, &test_threads[i]) == 0); + EXPECT(thread_start(&test_threads[i]) == 0); } } for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) { + j = -1; if (MP_HAS(SINGLE_THREADED)) { if (argc > 1) { for (j = 1; j < argc; ++j) { @@ -2584,7 +2648,7 @@ static int unit_tests(int argc, char **argv) if (test[i].fn) j = test[i].fn(); } else if (MP_HAS(MULTI_THREADED)) { - EXPECT(pthread_join(test_threads[i].thread_id, (void **)&res) == 0); + EXPECT(thread_join(&test_threads[i], &res) == 0); j = res->ret; } printf("TEST %s\n", test[i].name); diff --git a/makefile b/makefile 
index ec32ecd09..437e3d4e9 100644 --- a/makefile +++ b/makefile @@ -172,9 +172,10 @@ c89: -e 's/UINT32_MAX/0xFFFFFFFFu/g' \ -e 's/UINT64_MAX/(mp_u64)-1/g' \ -e 's/INT32_MAX/0x7FFFFFFF/g' \ - -e 's/INT32_MIN/(-2147483647-1)/g' \ + -e 's/INT32_MIN/(-2147483647-1)/g' \ -e 's/INT64_MAX/(mp_i64)(((mp_u64)1<<63)-1)/g' \ -e 's/INT64_MIN/(mp_i64)((mp_u64)1<<63)/g' \ + -e 's/uintptr_t/mp_uintptr/g' \ -e 's/SIZE_MAX/((size_t)-1)/g' \ -e 's/\(PRI[ioux]64\)/MP_\1/g' \ -e 's/uint\([0-9][0-9]*\)_t/mp_u\1/g' \ @@ -195,10 +196,11 @@ c99: -e 's/false_/MP_NO_/g' \ -e 's/0xFFFFFFFFu/UINT32_MAX/g' \ -e 's/(mp_u64)-1/UINT64_MAX/g' \ - -e 's/(-2147483647-1)/INT32_MIN/g' \ + -e 's/(-2147483647-1)/INT32_MIN/g' \ -e 's/0x7FFFFFFF/INT32_MAX/g' \ -e 's/(mp_i64)((mp_u64)1<<63)/INT64_MIN/g' \ -e 's/(mp_i64)(((mp_u64)1<<63)-1)/INT64_MAX/g' \ + -e 's/mp_uintptr/uintptr_t/g' \ -e 's/((size_t)-1)/SIZE_MAX/g' \ -e 's/MP_\(PRI[ioux]64\)/\1/g' \ -e 's/mp_u\([0-9][0-9]*\)/uint\1_t/g' \ diff --git a/makefile.msvc b/makefile.msvc index 5d1285490..da5e2fd62 100644 --- a/makefile.msvc +++ b/makefile.msvc @@ -11,7 +11,7 @@ #The following can be overridden from command line e.g. 
make -f makefile.msvc CC=gcc ARFLAGS=rcs PREFIX = c:\devel -CFLAGS = /Ox +CFLAGS = /Ox /Oi LDFLAGS = #Compilation flags diff --git a/s_mp_cmpexch_n.c b/s_mp_cmpexch_n.c index 6334d9d4b..e8ef969c2 100644 --- a/s_mp_cmpexch_n.c +++ b/s_mp_cmpexch_n.c @@ -26,8 +26,7 @@ static bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired) static bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired) { - InterlockedCompareExchangePointer(ptr, desired, *(expected)); - return *ptr == desired; + return InterlockedCompareExchangePointer(ptr, desired, *(expected)); } #endif diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c index 068e8145d..39176eb2c 100644 --- a/s_mp_warray_get.c +++ b/s_mp_warray_get.c @@ -11,14 +11,18 @@ void *s_mp_warray_get(void) if (mp_warray_init(1, false) != MP_OKAY) return NULL; } - for (n = 0; n < s_mp_warray.allocated; ++n) { - if (s_mp_warray.l_free[n].warray == NULL) + for (n = 0; n < s_mp_warray.allocated;) { + if (s_mp_warray.l_free[n].warray == NULL) { + n++; continue; + } ret = s_mp_warray.l_free[n].warray; if (s_mp_cmpexch_n(&s_mp_warray.l_free[n].warray, &ret, NULL)) { s_mp_warray.l_used[n].warray = ret; goto LBL_OUT; } + /* restart from the beginning if we missed a potential slot */ + n = 0; } ret = NULL; if (s_mp_warray.allocated + 1 > s_mp_warray.usable) diff --git a/tommath_c89.h b/tommath_c89.h index 49400a131..22436366b 100644 --- a/tommath_c89.h +++ b/tommath_c89.h @@ -26,6 +26,11 @@ typedef __UINT8_TYPE__ mp_u8; typedef __UINT16_TYPE__ mp_u16; typedef __UINT32_TYPE__ mp_u32; typedef __UINT64_TYPE__ mp_u64; +# if __WORDSIZE == 64 +typedef __UINT64_TYPE__ mp_uintptr; +# else +typedef __UINT32_TYPE__ mp_uintptr; +# endif /* inttypes.h replacement, printf format specifier */ # if __WORDSIZE == 64 From 334465dd1773832d224175fdb252c741e479d32e Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Wed, 27 Mar 2024 14:49:09 +0100 Subject: [PATCH 06/11] Update makefiles Signed-off-by: Steffen Jaeckel --- libtommath_VS2008.vcproj 
| 28 ++++++++++++++++++++++++++++ makefile | 15 ++++++++------- makefile.mingw | 15 ++++++++------- makefile.msvc | 15 ++++++++------- makefile.shared | 15 ++++++++------- makefile.unix | 15 ++++++++------- sources.cmake | 7 +++++++ 7 files changed, 75 insertions(+), 35 deletions(-) diff --git a/libtommath_VS2008.vcproj b/libtommath_VS2008.vcproj index 13158a09d..816217e8d 100644 --- a/libtommath_VS2008.vcproj +++ b/libtommath_VS2008.vcproj @@ -792,6 +792,14 @@ RelativePath="mp_unpack.c" > + + + + @@ -804,6 +812,10 @@ RelativePath="s_mp_add.c" > + + @@ -928,6 +940,22 @@ RelativePath="s_mp_sub.c" > + + + + + + + + diff --git a/makefile b/makefile index 437e3d4e9..a1729d7f0 100644 --- a/makefile +++ b/makefile @@ -43,13 +43,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \ mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \ mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \ -mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \ -s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \ -s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \ -s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \ -s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \ -s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \ -s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o +mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \ +s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \ 
+s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \ +s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \ +s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \ +s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \ +s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \ +s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o #END_INS diff --git a/makefile.mingw b/makefile.mingw index 532747be0..7597ba6df 100644 --- a/makefile.mingw +++ b/makefile.mingw @@ -45,13 +45,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \ mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \ mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \ -mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \ -s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \ -s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \ -s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \ -s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \ -s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \ -s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o +mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \ +s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \ +s_mp_exptmod_fast.o 
s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \ +s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \ +s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \ +s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \ +s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \ +s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o HEADERS_PUB=tommath.h HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB) diff --git a/makefile.msvc b/makefile.msvc index da5e2fd62..e6e8db7fe 100644 --- a/makefile.msvc +++ b/makefile.msvc @@ -41,13 +41,14 @@ mp_reduce_2k_l.obj mp_reduce_2k_setup.obj mp_reduce_2k_setup_l.obj mp_reduce_is_ mp_reduce_setup.obj mp_root_n.obj mp_rshd.obj mp_sbin_size.obj mp_set.obj mp_set_double.obj mp_set_i32.obj mp_set_i64.obj \ mp_set_l.obj mp_set_u32.obj mp_set_u64.obj mp_set_ul.obj mp_shrink.obj mp_signed_rsh.obj mp_sqrmod.obj mp_sqrt.obj \ mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj mp_to_ubin.obj mp_ubin_size.obj \ -mp_unpack.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj s_mp_div_recursive.obj \ -s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_fp_log.obj s_mp_fp_log_d.obj \ -s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj \ -s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj \ -s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj s_mp_radix_map.obj \ -s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj \ -s_mp_sqr_toom.obj s_mp_sub.obj s_mp_zero_buf.obj s_mp_zero_digs.obj +mp_unpack.obj 
mp_warray_free.obj mp_warray_init.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_cmpexch_n.obj \ +s_mp_copy_digs.obj s_mp_div_3.obj s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj \ +s_mp_exptmod_fast.obj s_mp_fp_log.obj s_mp_fp_log_d.obj s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj \ +s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj \ +s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj s_mp_prime_is_divisible.obj \ +s_mp_prime_tab.obj s_mp_radix_map.obj s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj \ +s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj s_mp_warray.obj s_mp_warray_free.obj \ +s_mp_warray_get.obj s_mp_warray_put.obj s_mp_zero_buf.obj s_mp_zero_digs.obj HEADERS_PUB=tommath.h HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB) diff --git a/makefile.shared b/makefile.shared index c9b933513..315252f35 100644 --- a/makefile.shared +++ b/makefile.shared @@ -40,13 +40,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \ mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \ mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \ -mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \ -s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \ -s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \ -s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \ -s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \ 
-s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \ -s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o +mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \ +s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \ +s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \ +s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \ +s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \ +s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \ +s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \ +s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o #END_INS diff --git a/makefile.unix b/makefile.unix index 34ebd1a86..c74ec5d7b 100644 --- a/makefile.unix +++ b/makefile.unix @@ -46,13 +46,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \ mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \ mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \ -mp_unpack.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o \ -s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o \ -s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o \ -s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o \ -s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o s_mp_radix_map.o \ -s_mp_radix_size_overestimate.o 
s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o s_mp_sqr_karatsuba.o \ -s_mp_sqr_toom.o s_mp_sub.o s_mp_zero_buf.o s_mp_zero_digs.o +mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \ +s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \ +s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \ +s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \ +s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \ +s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \ +s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \ +s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o HEADERS_PUB=tommath.h diff --git a/sources.cmake b/sources.cmake index bbb2aeab6..a23dbd451 100644 --- a/sources.cmake +++ b/sources.cmake @@ -122,9 +122,12 @@ mp_to_sbin.c mp_to_ubin.c mp_ubin_size.c mp_unpack.c +mp_warray_free.c +mp_warray_init.c mp_xor.c mp_zero.c s_mp_add.c +s_mp_cmpexch_n.c s_mp_copy_digs.c s_mp_div_3.c s_mp_div_recursive.c @@ -156,6 +159,10 @@ s_mp_sqr_comba.c s_mp_sqr_karatsuba.c s_mp_sqr_toom.c s_mp_sub.c +s_mp_warray.c +s_mp_warray_free.c +s_mp_warray_get.c +s_mp_warray_put.c s_mp_zero_buf.c s_mp_zero_digs.c ) From 33a9d0d5959abda6cca4fcc0fa9cd71aa653088d Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Fri, 29 Mar 2024 10:50:28 +0100 Subject: [PATCH 07/11] Use Thread Local Storage for `warray` buffer Signed-off-by: Steffen Jaeckel --- demo/test.c | 20 +++++++++++--------- mp_warray_free.c | 18 ++++++------------ mp_warray_init.c | 46 ---------------------------------------------- s_mp_cmpexch_n.c | 43 ------------------------------------------- s_mp_warray.c | 2 +- s_mp_warray_free.c | 17 ----------------- s_mp_warray_get.c | 34 
+++++++--------------------------- s_mp_warray_put.c | 12 ++++-------- tommath.h | 1 - tommath_private.h | 17 ++++++++--------- 10 files changed, 37 insertions(+), 173 deletions(-) delete mode 100644 mp_warray_init.c delete mode 100644 s_mp_cmpexch_n.c delete mode 100644 s_mp_warray_free.c diff --git a/demo/test.c b/demo/test.c index 5ab9686c3..2fa6e08db 100644 --- a/demo/test.c +++ b/demo/test.c @@ -2502,20 +2502,24 @@ struct thread_info { int ret; }; -static void *run_pthread(void *arg) +static void run(struct thread_info *tinfo) { - struct thread_info *tinfo = arg; - tinfo->ret = tinfo->t->fn(); + if (mp_warray_free() == -2) + tinfo->ret = EXIT_FAILURE; +} + +static void *run_pthread(void *arg) +{ + run(arg); + return arg; } static unsigned long run_msvc(void *arg) { - struct thread_info *tinfo = arg; - - tinfo->ret = tinfo->t->fn(); + run(arg); return 0; } @@ -2609,7 +2613,6 @@ static int unit_tests(int argc, char **argv) }; struct thread_info test_threads[sizeof(test)/sizeof(test[0])], *res; unsigned long i, ok, fail, nop; - size_t n_threads = MP_HAS(MULTI_THREADED) ? 
sizeof(test) / sizeof(test[0]) : 1; uint64_t t; int j; ok = fail = nop = 0; @@ -2620,8 +2623,7 @@ static int unit_tests(int argc, char **argv) mp_rand_source(s_mp_rand_jenkins); if (MP_HAS(MP_SMALL_STACK_SIZE)) { - printf("Small-stack enabled with %zu warray buffers\n\n", n_threads); - DO(mp_warray_init(n_threads, 1)); + printf("Small-stack enabled\n\n"); } if (MP_HAS(MULTI_THREADED)) { diff --git a/mp_warray_free.c b/mp_warray_free.c index 088efefc4..f7470f818 100644 --- a/mp_warray_free.c +++ b/mp_warray_free.c @@ -9,19 +9,13 @@ MP_STATIC_ASSERT(warray_free_sz_does_not_overflow, (sizeof(mp_word) * MP_WARRAY) static int s_warray_free(void) { int ret = 0; - size_t n; - for (n = 0; n < s_mp_warray.allocated; ++n) { - if (s_mp_warray.l_used[n].warray) { - ret = -2; - goto ERR_OUT; - } + if (s_mp_warray.w_used) + return -2; + if (s_mp_warray.w_free) { + s_mp_zero_buf(s_mp_warray.w_free, sizeof(mp_word) * MP_WARRAY); + MP_FREE(s_mp_warray.w_free, sizeof(mp_word) * MP_WARRAY); + s_mp_warray.w_free = NULL; } - for (n = 0; n < s_mp_warray.allocated; ++n) { - MP_FREE(s_mp_warray.l_free[n].warray, sizeof(mp_word) * MP_WARRAY); - s_mp_warray.l_free[n].warray = NULL; - } - s_mp_warray_free(s_mp_warray.usable); -ERR_OUT: return ret; } diff --git a/mp_warray_init.c b/mp_warray_init.c deleted file mode 100644 index c25098861..000000000 --- a/mp_warray_init.c +++ /dev/null @@ -1,46 +0,0 @@ -#include "tommath_private.h" -#ifdef MP_WARRAY_INIT_C -/* LibTomMath, multiple-precision integer library -- Tom St Denis */ -/* SPDX-License-Identifier: Unlicense */ - -static mp_err s_warray_init(size_t n_alloc, bool preallocate) -{ - size_t n; - if (s_mp_warray.l_free != NULL || s_mp_warray.l_used != NULL) { - return MP_VAL; - } - - s_mp_warray.l_free = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_free))); - s_mp_warray.l_used = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_used))); - if (s_mp_warray.l_free == NULL || s_mp_warray.l_used == NULL) { - s_mp_warray_free(n_alloc); - return MP_MEM; - } - 
- if (preallocate) { - for (n = 0; n < n_alloc; ++n) { - s_mp_warray.l_free[n].warray = MP_CALLOC(MP_WARRAY, sizeof(mp_word)); - if (s_mp_warray.l_free[n].warray == NULL) { - while (n > 0) { - n--; - MP_FREE(s_mp_warray.l_free[n].warray, MP_WARRAY * sizeof(mp_word)); - s_mp_warray.l_free[n].warray = NULL; - } - s_mp_warray_free(n_alloc); - return MP_MEM; - } - } - s_mp_warray.allocated = n_alloc; - } - - s_mp_warray.usable = n_alloc; - return MP_OKAY; -} - -mp_err mp_warray_init(size_t n_alloc, bool preallocate) -{ - if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate); - return MP_ERR; -} - -#endif diff --git a/s_mp_cmpexch_n.c b/s_mp_cmpexch_n.c deleted file mode 100644 index e8ef969c2..000000000 --- a/s_mp_cmpexch_n.c +++ /dev/null @@ -1,43 +0,0 @@ -#include "tommath_private.h" -#ifdef S_MP_CMPEXCH_N_C -/* LibTomMath, multiple-precision integer library -- Tom St Denis */ -/* SPDX-License-Identifier: Unlicense */ - -#ifdef __GNUC__ -#define S_CMPEXCH_N_GCC_C -static bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired) -{ - return __atomic_compare_exchange_n(ptr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE); -} -#endif - -#ifdef _MSC_VER -#define S_CMPEXCH_N_MSVC_C - -#ifndef _WIN32_WINNT -#define _WIN32_WINNT 0x0501 -#endif -#ifndef WINVER -#define WINVER 0x0501 -#endif - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - -static bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired) -{ - return InterlockedCompareExchangePointer(ptr, desired, *(expected)); -} -#endif - -bool s_cmpexch_n_gcc(void **ptr, void **expected, void *desired); -bool s_cmpexch_n_msvc(void **ptr, void **expected, void *desired); - -bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired) -{ - if (MP_HAS(S_CMPEXCH_N_GCC)) return s_cmpexch_n_gcc(ptr, expected, desired); - if (MP_HAS(S_CMPEXCH_N_MSVC)) return s_cmpexch_n_msvc(ptr, expected, desired); - return false; -} - -#endif diff --git a/s_mp_warray.c b/s_mp_warray.c index
d181057cb..1b8b068b7 100644 --- a/s_mp_warray.c +++ b/s_mp_warray.c @@ -3,6 +3,6 @@ /* LibTomMath, multiple-precision integer library -- Tom St Denis */ /* SPDX-License-Identifier: Unlicense */ -st_warray s_mp_warray; +mp_thread st_warray s_mp_warray = { 0 }; #endif diff --git a/s_mp_warray_free.c b/s_mp_warray_free.c deleted file mode 100644 index 9d8b75eb1..000000000 --- a/s_mp_warray_free.c +++ /dev/null @@ -1,17 +0,0 @@ -#include "tommath_private.h" -#ifdef S_MP_WARRAY_FREE_C -/* LibTomMath, multiple-precision integer library -- Tom St Denis */ -/* SPDX-License-Identifier: Unlicense */ - -void s_mp_warray_free(size_t n) -{ - (void)n; - MP_FREE(s_mp_warray.l_free, n * sizeof(*(s_mp_warray.l_free))); - MP_FREE(s_mp_warray.l_used, n * sizeof(*(s_mp_warray.l_used))); - s_mp_warray.l_free = NULL; - s_mp_warray.l_used = NULL; - s_mp_warray.allocated = 0; - s_mp_warray.usable = 0; -} - -#endif diff --git a/s_mp_warray_get.c b/s_mp_warray_get.c index 39176eb2c..26b0d7c10 100644 --- a/s_mp_warray_get.c +++ b/s_mp_warray_get.c @@ -5,34 +5,14 @@ void *s_mp_warray_get(void) { - void *ret = NULL; - size_t n; - if (s_mp_warray.usable == 0) { - if (mp_warray_init(1, false) != MP_OKAY) - return NULL; + if (s_mp_warray.w_used) + return NULL; + if (s_mp_warray.w_free == NULL) { + s_mp_warray.w_free = MP_CALLOC(MP_WARRAY, sizeof(mp_word)); } - for (n = 0; n < s_mp_warray.allocated;) { - if (s_mp_warray.l_free[n].warray == NULL) { - n++; - continue; - } - ret = s_mp_warray.l_free[n].warray; - if (s_mp_cmpexch_n(&s_mp_warray.l_free[n].warray, &ret, NULL)) { - s_mp_warray.l_used[n].warray = ret; - goto LBL_OUT; - } - /* restart from the beginning if we missed a potential slot */ - n = 0; - } - ret = NULL; - if (s_mp_warray.allocated + 1 > s_mp_warray.usable) - goto LBL_OUT; - ret = MP_CALLOC(MP_WARRAY, sizeof(mp_word)); - if (ret != NULL) - s_mp_warray.l_used[s_mp_warray.allocated++].warray = ret; - -LBL_OUT: - return ret; + s_mp_warray.w_used = s_mp_warray.w_free; + 
s_mp_warray.w_free = NULL; + return s_mp_warray.w_used; } #endif diff --git a/s_mp_warray_put.c b/s_mp_warray_put.c index 4cf413d62..79e014acd 100644 --- a/s_mp_warray_put.c +++ b/s_mp_warray_put.c @@ -5,14 +5,10 @@ void s_mp_warray_put(void *w) { - size_t n, allocated = s_mp_warray.allocated; - for (n = 0; n < allocated; ++n) { - if (s_mp_warray.l_used[n].warray == w) { - s_mp_warray.l_used[n].warray = NULL; - s_mp_warray.l_free[n].warray = w; - break; - } - } + if (s_mp_warray.w_free || s_mp_warray.w_used != w) + return; + s_mp_warray.w_free = w; + s_mp_warray.w_used = NULL; } #endif diff --git a/tommath.h b/tommath.h index 7da36d0e9..1820d2436 100644 --- a/tommath.h +++ b/tommath.h @@ -588,7 +588,6 @@ mp_err mp_fread(mp_int *a, int radix, FILE *stream) MP_WUR; mp_err mp_fwrite(const mp_int *a, int radix, FILE *stream) MP_WUR; #endif -mp_err mp_warray_init(size_t n_alloc, bool preallocate); int mp_warray_free(void); #define mp_to_binary(M, S, N) mp_to_radix((M), (S), (N), NULL, 2) diff --git a/tommath_private.h b/tommath_private.h index 36291a061..9c25f330f 100644 --- a/tommath_private.h +++ b/tommath_private.h @@ -234,8 +234,6 @@ MP_PRIVATE mp_err s_mp_radix_size_overestimate(const mp_int *a, const int radix, MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR; MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR; -MP_PRIVATE bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired); - #ifdef MP_SMALL_STACK_SIZE #define MP_SMALL_STACK_SIZE_C #define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get() @@ -247,19 +245,20 @@ MP_PRIVATE bool s_mp_cmpexch_n(void **ptr, void **expected, void *desired); #define MP_CHECK_WARRAY(name) #endif -struct warray { - void *warray; -}; +#if defined(_MSC_VER) +#define mp_thread __declspec(thread) +#elif defined(__GNUC__) +#define mp_thread __thread +#endif + typedef struct { - struct warray *l_free, *l_used; - size_t allocated, usable; + void *w_free, *w_used; } st_warray; -extern MP_PRIVATE 
st_warray s_mp_warray; +extern MP_PRIVATE mp_thread st_warray s_mp_warray; MP_PRIVATE void *s_mp_warray_get(void); MP_PRIVATE void s_mp_warray_put(void *w); -MP_PRIVATE void s_mp_warray_free(size_t n); #define MP_RADIX_MAP_REVERSE_SIZE 80u extern MP_PRIVATE const char s_mp_radix_map[]; From 513d48d404c77f6886297828e33c4002541aac10 Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Fri, 29 Mar 2024 10:50:45 +0100 Subject: [PATCH 08/11] regen Signed-off-by: Steffen Jaeckel --- libtommath_VS2008.vcproj | 12 ------------ makefile | 16 ++++++++-------- makefile.mingw | 16 ++++++++-------- makefile.msvc | 16 ++++++++-------- makefile.shared | 16 ++++++++-------- makefile.unix | 16 ++++++++-------- sources.cmake | 3 --- tommath.def | 1 - tommath_class.h | 17 +---------------- 9 files changed, 41 insertions(+), 72 deletions(-) diff --git a/libtommath_VS2008.vcproj b/libtommath_VS2008.vcproj index 816217e8d..71dd3807f 100644 --- a/libtommath_VS2008.vcproj +++ b/libtommath_VS2008.vcproj @@ -796,10 +796,6 @@ RelativePath="mp_warray_free.c" > - - @@ -812,10 +808,6 @@ RelativePath="s_mp_add.c" > - - @@ -944,10 +936,6 @@ RelativePath="s_mp_warray.c" > - - diff --git a/makefile b/makefile index a1729d7f0..8f211f5f2 100644 --- a/makefile +++ b/makefile @@ -43,14 +43,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \ mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \ mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \ -mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \ -s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \ -s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \ -s_mp_log_2expt.o 
s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \ -s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \ -s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \ -s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \ -s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o +mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \ +s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \ +s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \ +s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \ +s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \ +s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \ +s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \ +s_mp_zero_buf.o s_mp_zero_digs.o #END_INS diff --git a/makefile.mingw b/makefile.mingw index 7597ba6df..e2445e8a0 100644 --- a/makefile.mingw +++ b/makefile.mingw @@ -45,14 +45,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \ mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \ mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \ -mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \ -s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \ -s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o 
s_mp_invmod.o s_mp_invmod_odd.o \ -s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \ -s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \ -s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \ -s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \ -s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o +mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \ +s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \ +s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \ +s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \ +s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \ +s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \ +s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \ +s_mp_zero_buf.o s_mp_zero_digs.o HEADERS_PUB=tommath.h HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB) diff --git a/makefile.msvc b/makefile.msvc index e6e8db7fe..8540ca33d 100644 --- a/makefile.msvc +++ b/makefile.msvc @@ -41,14 +41,14 @@ mp_reduce_2k_l.obj mp_reduce_2k_setup.obj mp_reduce_2k_setup_l.obj mp_reduce_is_ mp_reduce_setup.obj mp_root_n.obj mp_rshd.obj mp_sbin_size.obj mp_set.obj mp_set_double.obj mp_set_i32.obj mp_set_i64.obj \ mp_set_l.obj mp_set_u32.obj mp_set_u64.obj mp_set_ul.obj mp_shrink.obj mp_signed_rsh.obj mp_sqrmod.obj mp_sqrt.obj \ mp_sqrtmod_prime.obj mp_sub.obj mp_sub_d.obj mp_submod.obj mp_to_radix.obj mp_to_sbin.obj mp_to_ubin.obj mp_ubin_size.obj \ -mp_unpack.obj mp_warray_free.obj mp_warray_init.obj mp_xor.obj 
mp_zero.obj s_mp_add.obj s_mp_cmpexch_n.obj \ -s_mp_copy_digs.obj s_mp_div_3.obj s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj \ -s_mp_exptmod_fast.obj s_mp_fp_log.obj s_mp_fp_log_d.obj s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj \ -s_mp_log_2expt.obj s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj \ -s_mp_mul_high.obj s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj s_mp_prime_is_divisible.obj \ -s_mp_prime_tab.obj s_mp_radix_map.obj s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj \ -s_mp_sqr_comba.obj s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj s_mp_warray.obj s_mp_warray_free.obj \ -s_mp_warray_get.obj s_mp_warray_put.obj s_mp_zero_buf.obj s_mp_zero_digs.obj +mp_unpack.obj mp_warray_free.obj mp_xor.obj mp_zero.obj s_mp_add.obj s_mp_copy_digs.obj s_mp_div_3.obj \ +s_mp_div_recursive.obj s_mp_div_school.obj s_mp_div_small.obj s_mp_exptmod.obj s_mp_exptmod_fast.obj s_mp_fp_log.obj \ +s_mp_fp_log_d.obj s_mp_get_bit.obj s_mp_invmod.obj s_mp_invmod_odd.obj s_mp_log_2expt.obj \ +s_mp_montgomery_reduce_comba.obj s_mp_mul.obj s_mp_mul_balance.obj s_mp_mul_comba.obj s_mp_mul_high.obj \ +s_mp_mul_high_comba.obj s_mp_mul_karatsuba.obj s_mp_mul_toom.obj s_mp_prime_is_divisible.obj s_mp_prime_tab.obj \ +s_mp_radix_map.obj s_mp_radix_size_overestimate.obj s_mp_rand_platform.obj s_mp_sqr.obj s_mp_sqr_comba.obj \ +s_mp_sqr_karatsuba.obj s_mp_sqr_toom.obj s_mp_sub.obj s_mp_warray.obj s_mp_warray_get.obj s_mp_warray_put.obj \ +s_mp_zero_buf.obj s_mp_zero_digs.obj HEADERS_PUB=tommath.h HEADERS=tommath_private.h tommath_class.h tommath_superclass.h tommath_cutoffs.h $(HEADERS_PUB) diff --git a/makefile.shared b/makefile.shared index 315252f35..50c335269 100644 --- a/makefile.shared +++ b/makefile.shared @@ -40,14 +40,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m mp_reduce_setup.o mp_root_n.o mp_rshd.o mp_sbin_size.o 
mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \ mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \ mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \ -mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \ -s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \ -s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \ -s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \ -s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \ -s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \ -s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \ -s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o +mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \ +s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \ +s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \ +s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \ +s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \ +s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \ +s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \ +s_mp_zero_buf.o s_mp_zero_digs.o #END_INS diff --git a/makefile.unix b/makefile.unix index c74ec5d7b..58642098d 100644 --- a/makefile.unix +++ b/makefile.unix @@ -46,14 +46,14 @@ mp_reduce_2k_l.o mp_reduce_2k_setup.o mp_reduce_2k_setup_l.o mp_reduce_is_2k.o m mp_reduce_setup.o 
mp_root_n.o mp_rshd.o mp_sbin_size.o mp_set.o mp_set_double.o mp_set_i32.o mp_set_i64.o \ mp_set_l.o mp_set_u32.o mp_set_u64.o mp_set_ul.o mp_shrink.o mp_signed_rsh.o mp_sqrmod.o mp_sqrt.o \ mp_sqrtmod_prime.o mp_sub.o mp_sub_d.o mp_submod.o mp_to_radix.o mp_to_sbin.o mp_to_ubin.o mp_ubin_size.o \ -mp_unpack.o mp_warray_free.o mp_warray_init.o mp_xor.o mp_zero.o s_mp_add.o s_mp_cmpexch_n.o \ -s_mp_copy_digs.o s_mp_div_3.o s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o \ -s_mp_exptmod_fast.o s_mp_fp_log.o s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o \ -s_mp_log_2expt.o s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o \ -s_mp_mul_high.o s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o \ -s_mp_prime_tab.o s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o \ -s_mp_sqr_comba.o s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_free.o \ -s_mp_warray_get.o s_mp_warray_put.o s_mp_zero_buf.o s_mp_zero_digs.o +mp_unpack.o mp_warray_free.o mp_xor.o mp_zero.o s_mp_add.o s_mp_copy_digs.o s_mp_div_3.o \ +s_mp_div_recursive.o s_mp_div_school.o s_mp_div_small.o s_mp_exptmod.o s_mp_exptmod_fast.o s_mp_fp_log.o \ +s_mp_fp_log_d.o s_mp_get_bit.o s_mp_invmod.o s_mp_invmod_odd.o s_mp_log_2expt.o \ +s_mp_montgomery_reduce_comba.o s_mp_mul.o s_mp_mul_balance.o s_mp_mul_comba.o s_mp_mul_high.o \ +s_mp_mul_high_comba.o s_mp_mul_karatsuba.o s_mp_mul_toom.o s_mp_prime_is_divisible.o s_mp_prime_tab.o \ +s_mp_radix_map.o s_mp_radix_size_overestimate.o s_mp_rand_platform.o s_mp_sqr.o s_mp_sqr_comba.o \ +s_mp_sqr_karatsuba.o s_mp_sqr_toom.o s_mp_sub.o s_mp_warray.o s_mp_warray_get.o s_mp_warray_put.o \ +s_mp_zero_buf.o s_mp_zero_digs.o HEADERS_PUB=tommath.h diff --git a/sources.cmake b/sources.cmake index a23dbd451..103e9c099 100644 --- a/sources.cmake +++ b/sources.cmake @@ -123,11 +123,9 @@ mp_to_ubin.c mp_ubin_size.c mp_unpack.c 
mp_warray_free.c -mp_warray_init.c mp_xor.c mp_zero.c s_mp_add.c -s_mp_cmpexch_n.c s_mp_copy_digs.c s_mp_div_3.c s_mp_div_recursive.c @@ -160,7 +158,6 @@ s_mp_sqr_karatsuba.c s_mp_sqr_toom.c s_mp_sub.c s_mp_warray.c -s_mp_warray_free.c s_mp_warray_get.c s_mp_warray_put.c s_mp_zero_buf.c diff --git a/tommath.def b/tommath.def index 7aa5860f7..ed5aa8b0c 100644 --- a/tommath.def +++ b/tommath.def @@ -126,7 +126,6 @@ EXPORTS mp_ubin_size mp_unpack mp_warray_free - mp_warray_init mp_xor mp_zero MP_MUL_KARATSUBA_CUTOFF diff --git a/tommath_class.h b/tommath_class.h index bb89cc237..09bb3ea63 100644 --- a/tommath_class.h +++ b/tommath_class.h @@ -132,11 +132,9 @@ # define MP_UBIN_SIZE_C # define MP_UNPACK_C # define MP_WARRAY_FREE_C -# define MP_WARRAY_INIT_C # define MP_XOR_C # define MP_ZERO_C # define S_MP_ADD_C -# define S_MP_CMPEXCH_N_C # define S_MP_COPY_DIGS_C # define S_MP_DIV_3_C # define S_MP_DIV_RECURSIVE_C @@ -169,7 +167,6 @@ # define S_MP_SQR_TOOM_C # define S_MP_SUB_C # define S_MP_WARRAY_C -# define S_MP_WARRAY_FREE_C # define S_MP_WARRAY_GET_C # define S_MP_WARRAY_PUT_C # define S_MP_ZERO_BUF_C @@ -965,11 +962,7 @@ #endif #if defined(MP_WARRAY_FREE_C) -# define S_MP_WARRAY_FREE_C -#endif - -#if defined(MP_WARRAY_INIT_C) -# define S_MP_WARRAY_FREE_C +# define S_MP_ZERO_BUF_C #endif #if defined(MP_XOR_C) @@ -987,9 +980,6 @@ # define S_MP_ZERO_DIGS_C #endif -#if defined(S_MP_CMPEXCH_N_C) -#endif - #if defined(S_MP_COPY_DIGS_C) #endif @@ -1308,12 +1298,7 @@ #if defined(S_MP_WARRAY_C) #endif -#if defined(S_MP_WARRAY_FREE_C) -#endif - #if defined(S_MP_WARRAY_GET_C) -# define MP_WARRAY_INIT_C -# define S_MP_CMPEXCH_N_C #endif #if defined(S_MP_WARRAY_PUT_C) From 4c2e17793856bb3f5280a379079144b3f3fc2af5 Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Fri, 29 Mar 2024 11:10:19 +0100 Subject: [PATCH 09/11] Use appveyor build matrix Signed-off-by: Steffen Jaeckel --- appveyor.yml | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff 
--git a/appveyor.yml b/appveyor.yml index 5606d9abd..b2e2d3907 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -11,6 +11,16 @@ image: - Visual Studio 2019 - Visual Studio 2017 - Visual Studio 2015 +environment: + matrix: + - CFLAGS_VAR: "" + CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC\"" + + - CFLAGS_VAR: "CFLAGS=\"/Ox /Oi /DMP_SMALL_STACK_SIZE\"" + CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC /DMP_SMALL_STACK_SIZE\"" + + - CFLAGS_VAR: "CFLAGS=\"/Ox /Oi /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD\"" + CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD\"" build_script: - cmd: >- if "Visual Studio 2022"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" @@ -18,17 +28,9 @@ build_script: if "Visual Studio 2017"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 if "Visual Studio 2015"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64 - nmake -f makefile.msvc test.exe CFLAGS="/Ox /Oi /DMP_SMALL_STACK_SIZE" - copy /Y test.exe test_small_stack.exe + nmake -f makefile.msvc test.exe %CFLAGS_VAR% nmake -f makefile.msvc clean-obj - nmake -f makefile.msvc test.exe CFLAGS="/Ox /Oi /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD" - copy /Y test.exe test_small_stack_multithreaded.exe - nmake -f makefile.msvc clean-obj - nmake -f makefile.msvc test.exe - nmake -f makefile.msvc clean-obj - nmake -f makefile.msvc test_dll.exe CFLAGS="/Ox /Oi /MD /DLTM_TEST_DYNAMIC" + nmake -f makefile.msvc test_dll.exe %CFLAGS_VAR_DLL% test_script: -- cmd: test_small_stack.exe -- cmd: test_small_stack_multithreaded.exe - cmd: test.exe - cmd: test_dll.exe From 
6c70ef1cec6fe26f4e9d316f72b0b2134268634b Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Fri, 29 Mar 2024 12:15:32 +0100 Subject: [PATCH 10/11] Run small-stack tests with valgrind Signed-off-by: Steffen Jaeckel --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fd8e34cf5..18a832bbe 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,9 +72,9 @@ jobs: # Build with small stack-size - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' } - - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } - - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' } - - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } + - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --limit-valgrind', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 libc6-dev-i386 gcc-multilib' } + - { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread --limit-valgrind', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'libc6-dev-i386 gcc-multilib' } + - { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --multithread', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: 
'0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' } # Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs. #- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune' From 3570e12884d54e1503fda03569517530c01f5ddd Mon Sep 17 00:00:00 2001 From: Steffen Jaeckel Date: Wed, 3 Apr 2024 15:48:23 +0200 Subject: [PATCH 11/11] Disable MP_SMALL_STACK_SIZE on MSVC Signed-off-by: Steffen Jaeckel --- appveyor.yml | 8 +------- makefile.msvc | 2 +- tommath_private.h | 23 +++++++++++++++++++---- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index b2e2d3907..2134f2ddf 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -14,13 +14,7 @@ image: environment: matrix: - CFLAGS_VAR: "" - CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC\"" - - - CFLAGS_VAR: "CFLAGS=\"/Ox /Oi /DMP_SMALL_STACK_SIZE\"" - CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC /DMP_SMALL_STACK_SIZE\"" - - - CFLAGS_VAR: "CFLAGS=\"/Ox /Oi /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD\"" - CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /Oi /MD /DLTM_TEST_DYNAMIC /DMP_SMALL_STACK_SIZE /DLTM_TEST_MULTITHREAD\"" + CFLAGS_VAR_DLL: "CFLAGS=\"/Ox /MD /DLTM_TEST_DYNAMIC\"" build_script: - cmd: >- if "Visual Studio 2022"=="%APPVEYOR_BUILD_WORKER_IMAGE%" call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" diff --git a/makefile.msvc b/makefile.msvc index 8540ca33d..8feb425c4 100644 --- a/makefile.msvc +++ b/makefile.msvc @@ -11,7 +11,7 @@ #The following can be overridden from command line e.g. 
make -f makefile.msvc CC=gcc ARFLAGS=rcs PREFIX = c:\devel -CFLAGS = /Ox /Oi +CFLAGS = /Ox LDFLAGS = #Compilation flags diff --git a/tommath_private.h b/tommath_private.h index 9c25f330f..be620dbc9 100644 --- a/tommath_private.h +++ b/tommath_private.h @@ -235,6 +235,23 @@ MP_PRIVATE mp_err s_mp_fp_log(const mp_int *a, mp_int *c) MP_WUR; MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR; #ifdef MP_SMALL_STACK_SIZE + +#if defined(__GNUC__) +/* We use TLS (Thread Local Storage) to manage the instance of the WARRAY + * per thread. + * The compilers we're usually looking at are GCC, Clang and MSVC. + * Both GCC and Clang are straight-forward with TLS, so it's enabled there. + * Using MSVC the tests were OK with the static library, but failed when + * the library was built as a DLL. As a result we completely disable + * support for MSVC. + * If your compiler can handle TLS properly without too much hocus pocus, + * feel free to open a PR to add support for it. + */ +#define mp_thread __thread +#else +#error "MP_SMALL_STACK_SIZE not supported with your compiler" +#endif + #define MP_SMALL_STACK_SIZE_C #define MP_ALLOC_WARRAY(name) *name = s_mp_warray_get() #define MP_FREE_WARRAY(name) s_mp_warray_put(name) @@ -245,10 +262,8 @@ MP_PRIVATE mp_err s_mp_fp_log_d(const mp_int *a, mp_word *c) MP_WUR; #define MP_CHECK_WARRAY(name) #endif -#if defined(_MSC_VER) -#define mp_thread __declspec(thread) -#elif defined(__GNUC__) -#define mp_thread __thread +#ifndef mp_thread +#define mp_thread #endif typedef struct {