Single-file, lock-free SPSC byte ring for embedded C++.
It moves bytes between one producer thread and one consumer thread through a fixed buffer. The buffer is caller-provided, and the cursors only advance on success.
The ring is a single nc_ring struct holding a base pointer, a capacity, and two
free-running cursors.
Warning
Defining NCRING_NO_STDLIB currently selects the fallback MemCpy implementation,
which copies one byte at a time. This severely hurts performance; if
NCRING_NO_STDLIB is necessary, I recommend providing a better implementation
for your architecture (see NCRING_MEMCPY below).
Copy nc_ring.h into your project.
In one C++ source file, define NCRING_IMPLEMENTATION before including the
header:
#define NCRING_IMPLEMENTATION
#include "nc_ring.h"
All other files that need the API include the header without the define.
Tip
To confine all symbols to a single translation unit, NCRING_STATIC can be
defined alongside NCRING_IMPLEMENTATION.
The atomics use the GCC/Clang __atomic builtins; the header errors on any other
compiler.
The following macros can be defined before including the header to replace default dependencies:
| Macro | Default | Purpose |
|---|---|---|
| `NCRING_NO_STDLIB` | Undefined | Suppresses `<stdint.h>` / `<string.h>`; caller must provide u8, u16, u32, u64, i8, i16, i32, i64, b32, b8, f32, f64 typedefs |
| `NCRING_STATIC` | Undefined | With NCRING_IMPLEMENTATION, gives every symbol internal linkage (NCRING_DEF becomes static) |
| `NCRING_MEMCPY(d, s, n)` | `memcpy` | Payload copy. This sits on the read and write hot path — point it at an optimised word-copy, not a byte loop (see Performance) |
NC_CPU_CACHE_LINE_SIZE is detected from the target (32 on 32-bit ARM/RISC-V/x86 and Xtensa,
64 on 64-bit ARM/RISC-V/x86-64) and sets the cursor padding. To override the detected value,
change it manually.
Bind the ring to a buffer whose size is a power of two.
u8 buffer[4096]; // power of two
nc_ring ring = nc_ring_init(buffer, sizeof(buffer));
nc_ring_write copies the whole record and returns its size, or copies nothing
and returns 0 when there isn't room.
// ring full — consumer hasn't caught up
while (!nc_ring_write(&ring, &record, sizeof(record)))
CPUPause();
nc_ring_write_struct(&ring, &record); // sizeof(*ptr) wrapper
nc_ring_read delivers the whole record and returns its size, or returns 0 when
fewer than Size bytes are available.
// ring empty — producer hasn't published
while (!nc_ring_read(&ring, &record, sizeof(record)))
CPUPause();
nc_ring_read_struct(&ring, &record); // sizeof(*ptr) wrapper
Tip
An implementation of CPUPause is provided that should cover most common CPU architectures; if yours is missing,
it is easily added manually in nc_ring.h.
Measured on a dual-core ESP32-WROOM (Xtensa LX6, 240 MHz), producer pinned to APP_CPU and consumer to PRO_CPU, one million records per run, NCRING_MEMCPY mapped to the
toolchain memcpy.
| Record size | Per record | End-to-end throughput |
|---|---|---|
| 8 B | 1.38 µs | ~5.6 MB/s |
| 512 B | 5.40 µs | ~92 MB/s |
| 2048 B | 19.8 µs | ~99 MB/s |
Push a few records, then drain them on the same thread.
// Example record: two u32 fields, written to and read from the ring as one unit.
struct rec {
u32 Seq; // printed as "seq" by the drain loop
u32 Val; // printed as "val" by the drain loop
};
// Caller-provided backing storage; 64 is a power of two, as the ring requires.
u8 buffer[64];
// Bind the ring to the buffer.
nc_ring ring = nc_ring_init(buffer, sizeof(buffer));
// Push four records: Seq = i, Val = i * 10.
for (u32 i = 0; i < 4; ++i) {
rec r = {
i,
i * 10
};
// Return value is ignored here; a write that finds no room returns 0 and copies nothing.
nc_ring_write_struct(&ring, &r);
}
rec out;
while (nc_ring_read_struct(&ring, &out))
printf("seq=%u val=%u\n", out.Seq, out.Val);
seq=0 val=0
seq=1 val=10
seq=2 val=20
seq=3 val=30
A producer task on one core streams records, and a consumer task on the other core verifies them.
#define NCRING_IMPLEMENTATION
#include "nc_ring.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "freertos/semphr.h"
// Size of the ring's backing buffer in bytes; passed to nc_ring_init in app_main.
#define RING_BYTES 4096u
// Number of records the producer writes and the consumer checks.
#define RECORDS 1000000u
// One test record: a sequence number plus the check word scramble() derives from it.
struct rec { u32 Seq; u32 Check; };
// Derives the check word for sequence number `s`: multiply by 2654435761u,
// then XOR the product with 0xA5A5A5A5u. Producer and consumer both call this
// so the consumer can recompute the expected value for each record.
static inline u32 scramble(u32 s) {
    const u32 product = s * 2654435761u;
    return product ^ 0xA5A5A5A5u;
}
// Backing storage handed to nc_ring_init in app_main.
static u8 g_buf[RING_BYTES];
// The ring shared by the producer and consumer tasks.
static nc_ring g_ring;
// Given by the consumer when it has checked every record; app_main blocks on it.
static SemaphoreHandle_t g_done;
// Mismatch count stored by the consumer before it gives g_done; read by app_main afterwards.
static volatile u32 g_errors;
static void
producer(void*)
{
for (u32 seq = 0; seq < RECORDS; ++seq) {
rec r = {
seq,
scramble(seq)
};
while (!nc_ring_write_struct(&g_ring, &r))
CPUPause();
}
vTaskDelete(NULL);
}
static void
consumer(void*)
{
u32 errors = 0;
for (u32 expected = 0; expected < RECORDS; ++expected) {
rec r = {};
while (!nc_ring_read_struct(&g_ring, &r))
CPUPause();
if (r.Seq != expected || r.Check != scramble(expected))
++errors;
}
g_errors = errors;
xSemaphoreGive(g_done);
vTaskDelete(NULL);
}
extern "C"
void
app_main(void)
{
g_ring = nc_ring_init(g_buf, RING_BYTES);
g_done = xSemaphoreCreateBinary();
// Keep app_main above the workers while both are created, so neither
// preempts setup before the other exists, then block to hand them the cores.
vTaskPrioritySet(NULL, configMAX_PRIORITIES - 1);
xTaskCreatePinnedToCore(producer, "prod", 4096, NULL, 5, NULL, 1);
xTaskCreatePinnedToCore(consumer, "cons", 4096, NULL, 5, NULL, 0);
xSemaphoreTake(g_done, portMAX_DELAY);
printf("%s\n", g_errors ? "FAIL" : "PASS");
}
PASS
| Constraint | Value | Defined by |
|---|---|---|
| Buffer size | 4 GiB | u32 Capacity |
| Capacity | power of two | mask-based wrap + free-running cursors |
| Concurrency | 1 producer, 1 consumer | SPSC; the lock-free design relies on each cursor having exactly one writer |