Skip to content

Commit 5f689b7

Browse files
committed
add MP_SMALL_STACK_SIZE option
This adds an option to use a heap-buffer for the usually stack-based `MP_WARRAY`-sized temporary buffers. Per default it will reserve a single buffer, which can be modified (1) at compile-time via the `MP_WARRAY_NUM` define, or (2) at run-time by calling `mp_warray_init()`. The internal structure can only be created once. If one wants to modify the maximum number of elements, the entire structure has to be freed by calling `mp_warray_free()`. In case one wants to use this option with multiple threads, one shall use the `mp_warray_init()` function and pass appropriate locking functions. Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
1 parent 03de03d commit 5f689b7

20 files changed

+397
-11
lines changed

.github/workflows/main.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ jobs:
5252
# RSA superclass with tests (no sanitizer, but debug info)
5353
- { BUILDOPTIONS: '--with-cc=gcc-5 --with-m64 --cflags=-DLTM_NOTHING --cflags=-DSC_RSA_1_WITH_TESTS --limit-valgrind', SANITIZER: '', COMPILE_DEBUG: '1', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-5' }
5454

55+
# Build with small stack-size
56+
- { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' }
57+
- { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_NO_LOCKING', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' }
58+
- { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
59+
- { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_TEST_LOCKING', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
60+
5561
# Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs.
5662
#- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune'
5763
#- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_32BIT --limit-valgrind --make-option=tune'
@@ -153,7 +159,7 @@ jobs:
153159
matrix:
154160
os: [ ubuntu-18.04, ubuntu-20.04 ]
155161
build_type: [ '', -DCMAKE_BUILD_TYPE=Debug, -DCMAKE_BUILD_TYPE=Release, -DCMAKE_BUILD_TYPE=RelWithDebInfo, -DCMAKE_BUILD_TYPE=MinSizeRel ]
156-
cc: [ clang, gcc ]
162+
cc: [ clang-10, gcc ]
157163
config:
158164
# Static library build
159165
- { CMAKEOPTIONS: '-DBUILD_SHARED_LIBS=Off' }
@@ -164,7 +170,7 @@ jobs:
164170
- name: install dependencies
165171
run: |
166172
sudo apt-get update -qq
167-
sudo apt-get install -y cmake gcc clang llvm
173+
sudo apt-get install -y cmake gcc clang-10 llvm-10
168174
- name: build
169175
run: |
170176
mkdir build

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ if(COMPILE_LTO)
122122
if(COMPILER_SUPPORTS_LTO)
123123
set_property(TARGET ${PROJECT_NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
124124
else()
125-
message(SEND_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.")
125+
message(FATAL_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.")
126126
endif()
127127
endif()
128128

demo/test.c

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2270,6 +2270,21 @@ static int test_mp_pack_unpack(void)
22702270
return EXIT_FAILURE;
22712271
}
22722272

2273+
2274+
#ifdef MP_TEST_LOCKING
2275+
#ifdef MP_NO_LOCKING
2276+
#error "Can't test locking when locking is disabled"
2277+
#endif
2278+
static mp_lock lock_ctx;
2279+
static int noop_lock_unlock(void *ctx)
2280+
{
2281+
EXPECT(ctx == &lock_ctx);
2282+
return 0;
2283+
LBL_ERR:
2284+
return -1;
2285+
}
2286+
#endif
2287+
22732288
#ifndef LTM_TEST_DYNAMIC
22742289
#define ONLY_PUBLIC_API_C
22752290
#endif
@@ -2340,14 +2355,22 @@ static int unit_tests(int argc, char **argv)
23402355
unsigned long i, ok, fail, nop;
23412356
uint64_t t;
23422357
int j;
2358+
#ifdef MP_TEST_LOCKING
2359+
lock_ctx.lock = noop_lock_unlock;
2360+
lock_ctx.unlock = noop_lock_unlock;
2361+
lock_ctx.ctx = &lock_ctx;
23432362

2363+
if (mp_warray_init(MP_WARRAY_NUM, true, &lock_ctx) != MP_OKAY)
2364+
return EXIT_FAILURE;
2365+
#endif
23442366
ok = fail = nop = 0;
23452367

23462368
t = (uint64_t)time(NULL);
23472369
printf("SEED: 0x%" PRIx64 "\n\n", t);
23482370
s_mp_rand_jenkins_init(t);
23492371
mp_rand_source(s_mp_rand_jenkins);
23502372

2373+
23512374
for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
23522375
if (argc > 1) {
23532376
for (j = 1; j < argc; ++j) {
@@ -2371,8 +2394,12 @@ static int unit_tests(int argc, char **argv)
23712394
}
23722395
fprintf(fail?stderr:stdout, "Tests OK/NOP/FAIL: %lu/%lu/%lu\n", ok, nop, fail);
23732396

2374-
if (fail != 0) return EXIT_FAILURE;
2375-
else return EXIT_SUCCESS;
2397+
EXPECT(mp_warray_free() != -2);
2398+
2399+
if (fail == 0)
2400+
return EXIT_SUCCESS;
2401+
LBL_ERR:
2402+
return EXIT_FAILURE;
23762403
}
23772404

23782405
int main(int argc, char **argv)

doc/bn.tex

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,20 @@ \subsubsection{Operand Size Related}
352352
\end{center}
353353
\end{small}
354354

355+
\subsection{Small-Stack option}
356+
\label{ch:SMALL_STACK_INTRO}
357+
The library can be compiled with the symbol \texttt{MP\_SMALL\_STACK\_SIZE} defined, which results in
358+
the temporary \texttt{MP\_WARRAY}-sized stack buffers being put on the heap.
359+
This comes with one problem, namely: the formerly promised thread-safety is no longer guaranteed.
360+
Therefore, if the Small-Stack option is enabled while doing multi-threading, the provided locking
361+
mechanism shall be used.
362+
For some use cases it can be desired to use the Small-Stack option, but there are no threads and
363+
therefore we provide the possibility to disable locking by defining the symbol \texttt{MP\_NO\_LOCKING}.
364+
365+
In case one already knows how many threads must be supported, the symbol \texttt{MP\_WARRAY\_NUM} can
366+
be useful. It can be pre-defined at compile time to the number of heap buffers created on automatic
367+
initialisation. Cf. \ref{ch:SMALL_STACK_API} for the dynamic API and further details.
368+
355369
\section{Purpose of LibTomMath}
356370
Unlike GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath
357371
was not written with bleeding edge performance in mind. First and foremost LibTomMath was written
@@ -428,7 +442,9 @@ \chapter{Getting Started with LibTomMath}
428442
\section{Building Programs}
429443
In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library
430444
file (typically
431-
libtommath.a). There is no library initialization required and the entire library is thread safe.
445+
libtommath.a). There is no library initialization required and the entire library is thread safe
446+
if it is used in its default configuration. Locking is recommended if the small-stack option
447+
is enabled and multiple threads are used, cf. \ref{ch:SMALL_STACK_INTRO} and \ref{ch:SMALL_STACK_API}.
432448

433449
\section{Return Codes}
434450
There are five possible return codes a function may return.
@@ -813,6 +829,53 @@ \subsection{Adding additional digits}
813829
\end{alltt}
814830
\end{small}
815831

832+
\section{Small-Stack option}
833+
\label{ch:SMALL_STACK_API}
834+
835+
In case the \texttt{MP\_SMALL\_STACK\_SIZE} symbol is defined the following functions
836+
can be useful.
837+
838+
To initialize the internal structure the following function shall be called.
839+
840+
\index{mp\_warray\_init}
841+
\begin{alltt}
842+
mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock);
843+
\end{alltt}
844+
845+
The \texttt{mp\_lock} struct looks as follows and shall be used to protect the
846+
internal structure when using the library in a multi-threaded application.
847+
848+
\index{mp\_lock}
849+
\begin{alltt}
850+
typedef struct {
851+
int (*lock)(void *ctx);
852+
int (*unlock)(void *ctx);
853+
void *ctx;
854+
} mp_lock;
855+
\end{alltt}
856+
857+
The \texttt{mp\_lock.lock} function will be called before, and the \texttt{mp\_lock.unlock} function
858+
after, modifying the internal struct.
859+
The \texttt{mp\_lock.ctx} element will be passed to those functions.
860+
861+
To free the internally allocated memory the following function shall be called.
862+
863+
\index{mp\_warray\_free}
864+
\begin{alltt}
865+
int mp_warray_free(void);
866+
\end{alltt}
867+
868+
The memory allocated on the heap will never be auto-free'd as there's no good point in time where
869+
the library knows when it is fine to free those buffers. Therefore one has to free those buffers
870+
oneself by calling \texttt{mp\_warray\_free()}.
871+
872+
873+
Those two API functions are always available, even if the \texttt{MP\_SMALL\_STACK\_SIZE} option
874+
has been disabled at compile time.
875+
In that case \texttt{mp\_warray\_init()} will return \texttt{MP\_ERR} and \texttt{mp\_warray\_free()}
876+
will return $-1$.
877+
878+
816879
\chapter{Basic Operations}
817880
\section{Copying}
818881

helper.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ sub update_dep
381381
foreach my $filename (glob '*mp_*.c') {
382382
my $content;
383383
my $cc = $ENV{'CC'} || 'gcc';
384-
$content = `$cc -E -x c -DLTM_ALL $filename`;
384+
$content = `$cc -E -x c -DLTM_ALL -DMP_SMALL_STACK_SIZE $filename`;
385385
$content =~ s/^# 1 "$filename".*?^# 2 "$filename"//ms;
386386

387387
# convert filename to upper case so we can use it as a define

mp_warray_free.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#include "tommath_private.h"
2+
#ifdef MP_WARRAY_FREE_C
3+
/* LibTomMath, multiple-precision integer library -- Tom St Denis */
4+
/* SPDX-License-Identifier: Unlicense */
5+
6+
/* static check that the multiplication won't overflow */
7+
MP_STATIC_ASSERT(warray_free_sz_does_not_overflow, (sizeof(mp_word) * MP_WARRAY) >= MP_WARRAY)
8+
9+
static int s_warray_free(void)
10+
{
11+
int ret = 0;
12+
size_t n;
13+
S_MP_WARRAY_LOCK();
14+
for (n = 0; n < s_mp_warray.allocated; ++n) {
15+
if (s_mp_warray.l_used[n].warray) {
16+
ret = -2;
17+
goto ERR_OUT;
18+
}
19+
}
20+
for (n = 0; n < s_mp_warray.allocated; ++n) {
21+
MP_FREE(s_mp_warray.l_free[n].warray, sizeof(mp_word) * MP_WARRAY);
22+
s_mp_warray.l_free[n].warray = NULL;
23+
}
24+
s_mp_warray_free(s_mp_warray.usable);
25+
ERR_OUT:
26+
S_MP_WARRAY_UNLOCK();
27+
return ret;
28+
}
29+
30+
int mp_warray_free(void)
31+
{
32+
if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_free();
33+
return -1;
34+
}
35+
36+
#endif

mp_warray_init.c

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#include "tommath_private.h"
2+
#ifdef MP_WARRAY_INIT_C
3+
/* LibTomMath, multiple-precision integer library -- Tom St Denis */
4+
/* SPDX-License-Identifier: Unlicense */
5+
6+
static mp_err s_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
7+
{
8+
size_t n;
9+
if (s_mp_warray.l_free != NULL || s_mp_warray.l_used != NULL) {
10+
return MP_VAL;
11+
}
12+
13+
if (MP_HAS(MP_USE_LOCKING) && (lock != NULL)) {
14+
if (lock->lock == NULL || lock->unlock == NULL)
15+
return MP_VAL;
16+
s_mp_warray.lock = *lock;
17+
s_mp_warray.locking_enabled = true;
18+
} else {
19+
s_mp_zero_buf(&s_mp_warray.lock, sizeof(s_mp_warray.lock));
20+
}
21+
22+
s_mp_warray.l_free = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_free)));
23+
s_mp_warray.l_used = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_used)));
24+
if (s_mp_warray.l_free == NULL || s_mp_warray.l_used == NULL) {
25+
s_mp_warray_free(n_alloc);
26+
return MP_MEM;
27+
}
28+
29+
if (preallocate) {
30+
for (n = 0; n < n_alloc; ++n) {
31+
s_mp_warray.l_free[n].warray = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
32+
if (s_mp_warray.l_free[n].warray == NULL) {
33+
while (n > 0) {
34+
n--;
35+
MP_FREE(s_mp_warray.l_free[n].warray, MP_WARRAY * sizeof(mp_word));
36+
s_mp_warray.l_free[n].warray = NULL;
37+
}
38+
s_mp_warray_free(n_alloc);
39+
return MP_MEM;
40+
}
41+
}
42+
s_mp_warray.allocated = n_alloc;
43+
}
44+
45+
s_mp_warray.usable = n_alloc;
46+
return MP_OKAY;
47+
}
48+
49+
mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
50+
{
51+
if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_init(n_alloc, preallocate, lock);
52+
return MP_ERR;
53+
}
54+
55+
#endif

s_mp_montgomery_reduce_comba.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
1515
{
1616
int ix, oldused;
1717
mp_err err;
18-
mp_word W[MP_WARRAY];
18+
mp_word MP_ALLOC_WARRAY(W);
19+
20+
MP_CHECK_WARRAY(W);
1921

2022
if (x->used > MP_WARRAY) {
23+
MP_FREE_WARRAY(W);
2124
return MP_VAL;
2225
}
2326

@@ -26,6 +29,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
2629

2730
/* grow a as required */
2831
if ((err = mp_grow(x, n->used + 1)) != MP_OKAY) {
32+
MP_FREE_WARRAY(W);
2933
return err;
3034
}
3135

@@ -110,6 +114,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
110114

111115
mp_clamp(x);
112116

117+
MP_FREE_WARRAY(W);
113118
/* if A >= m then A = A - m */
114119
if (mp_cmp_mag(x, n) != MP_LT) {
115120
return s_mp_sub(x, n, x);

s_mp_mul_comba.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,14 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
2323
{
2424
int oldused, pa, ix;
2525
mp_err err;
26-
mp_digit W[MP_WARRAY];
26+
mp_digit MP_ALLOC_WARRAY(W);
2727
mp_word _W;
2828

29+
MP_CHECK_WARRAY(W);
30+
2931
/* grow the destination as required */
3032
if ((err = mp_grow(c, digs)) != MP_OKAY) {
33+
MP_FREE_WARRAY(W);
3134
return err;
3235
}
3336

@@ -73,6 +76,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
7376
s_mp_zero_digs(c->dp + c->used, oldused - c->used);
7477

7578
mp_clamp(c);
79+
MP_FREE_WARRAY(W);
7680
return MP_OKAY;
7781
}
7882
#endif

s_mp_mul_high_comba.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs
1616
{
1717
int oldused, pa, ix;
1818
mp_err err;
19-
mp_digit W[MP_WARRAY];
19+
mp_digit MP_ALLOC_WARRAY(W);
2020
mp_word _W;
2121

22+
MP_CHECK_WARRAY(W);
23+
2224
/* grow the destination as required */
2325
pa = a->used + b->used;
2426
if ((err = mp_grow(c, pa)) != MP_OKAY) {
27+
MP_FREE_WARRAY(W);
2528
return err;
2629
}
2730

@@ -65,6 +68,7 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs
6568
s_mp_zero_digs(c->dp + c->used, oldused - c->used);
6669

6770
mp_clamp(c);
71+
MP_FREE_WARRAY(W);
6872
return MP_OKAY;
6973
}
7074
#endif

0 commit comments

Comments
 (0)