6 changes: 6 additions & 0 deletions Doc/whatsnew/3.15.rst
@@ -428,6 +428,12 @@ argparse
inline code when color output is enabled.
(Contributed by Savannah Ostrowski in :gh:`142390`.)

base64 & binascii
-----------------

* CPython's underlying base64 implementation now encodes 2x faster and decodes 3x
faster thanks to simple CPU pipelining optimizations.
Review comment (Member):

Contributed by ...


calendar
--------

@@ -0,0 +1,3 @@
The base64 implementation behind the :mod:`binascii` and :mod:`base64`
modules and the related codec has been optimized for modern pipelined CPU
architectures and now performs 2-3x faster across all platforms.
155 changes: 133 additions & 22 deletions Modules/binascii.c
@@ -76,7 +76,8 @@ get_binascii_state(PyObject *module)
}


static const unsigned char table_a2b_base64[] = {
/* Align to 64 bytes to ensure table fits in a single L1 cache line */
Review comment (Member):

The entire 256-byte table will not fit in a single L1 cache line; at a
typical 64 bytes per line it spans four lines.

It may be worth aligning anyway, but the comment is incorrect.

static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
@@ -101,9 +102,101 @@ static const unsigned char table_a2b_base64[] = {
/* Max binary chunk size; limited only by available memory */
#define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)

static const unsigned char table_b2a_base64[] =
/*
* Fast base64 encoding/decoding helpers.
*
* Process complete groups without loop-carried dependencies.
*/

/* Align to 64 bytes to ensure table fits in a single L1 cache line */
static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Encode 3 bytes into 4 base64 characters. */
static inline void
base64_encode_trio(const unsigned char *in, unsigned char *out,
const unsigned char *table)
{
unsigned int combined = ((unsigned int)in[0] << 16) |
((unsigned int)in[1] << 8) |
(unsigned int)in[2];
out[0] = table[(combined >> 18) & 0x3f];
out[1] = table[(combined >> 12) & 0x3f];
out[2] = table[(combined >> 6) & 0x3f];
out[3] = table[combined & 0x3f];
}
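
A quick sanity check of the trio helper, as a hypothetical standalone
harness (not part of the patch; it assumes table_b2a_base64 and
base64_encode_trio as defined in this diff are in scope):

#include <stdio.h>

int main(void)
{
    const unsigned char in[3] = {'M', 'a', 'n'};
    unsigned char out[4];

    /* combined = 0x4D616E; its 6-bit fields 19, 22, 5, 46 select "TWFu" */
    base64_encode_trio(in, out, table_b2a_base64);
    printf("%.4s\n", (const char *)out);    /* prints "TWFu" */
    return 0;
}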

/* Encode multiple complete 3-byte groups.
* Returns the number of input bytes processed (always a multiple of 3).
*/
static inline Py_ssize_t
base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
unsigned char *out, const unsigned char *table)
{
Py_ssize_t n_trios = in_len / 3;
Py_ssize_t i;

for (i = 0; i < n_trios; i++) {
base64_encode_trio(in + i * 3, out + i * 4, table);
Review comment (Member):

Is it faster than incrementing in and out by 3 and 4 respectively?

}

return n_trios * 3;
}
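
Regarding the reviewer's question above, a minimal sketch of the
pointer-increment form (base64_encode_fast_ptr is our hypothetical name,
not part of the patch). Since any optimizing compiler strength-reduces
the i * 3 and i * 4 index computations, the two forms typically compile
to equivalent code, so any difference is likely in the noise:

static inline Py_ssize_t
base64_encode_fast_ptr(const unsigned char *in, Py_ssize_t in_len,
                       unsigned char *out, const unsigned char *table)
{
    Py_ssize_t n_bytes = (in_len / 3) * 3;    /* complete trios only */
    const unsigned char *end = in + n_bytes;

    while (in < end) {
        base64_encode_trio(in, out, table);
        in += 3;     /* consume one 3-byte group */
        out += 4;    /* emit one 4-char group */
    }
    return n_bytes;
}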

/* Decode 4 base64 characters into 3 bytes.
* Returns 1 on success, 0 if any character is invalid.
*/
static inline int
base64_decode_quad(const unsigned char *in, unsigned char *out,
const unsigned char *table)
{
unsigned char v0 = table[in[0]];
unsigned char v1 = table[in[1]];
unsigned char v2 = table[in[2]];
unsigned char v3 = table[in[3]];

if ((v0 | v1 | v2 | v3) & 0xc0) {
return 0;
}

out[0] = (v0 << 2) | (v1 >> 4);
out[1] = (v1 << 4) | (v2 >> 2);
out[2] = (v2 << 6) | v3;
return 1;
}
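
The inverse direction, as a similar hypothetical harness (not part of the
patch; it assumes base64_decode_quad and table_a2b_base64 from above):

#include <stdio.h>

int main(void)
{
    unsigned char out[3];

    /* indices 19, 22, 5, 46 reassemble to 0x4D616E, i.e. "Man" */
    if (base64_decode_quad((const unsigned char *)"TWFu", out,
                           table_a2b_base64)) {
        printf("%.3s\n", (const char *)out);    /* prints "Man" */
    }
    /* '!' is outside the alphabet; its table entry has the 0xc0 bits
     * set, so the combined validity check fails: */
    printf("%d\n", base64_decode_quad((const unsigned char *)"TW!u", out,
                                      table_a2b_base64));    /* prints 0 */
    return 0;
}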

/* Decode multiple complete 4-character groups (no padding allowed).
* Returns the number of input characters processed.
* Stops at the first invalid character, padding, or incomplete group.
*/
static inline Py_ssize_t
base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
unsigned char *out, const unsigned char *table)
{
Py_ssize_t n_quads = in_len / 4;
Py_ssize_t i;

for (i = 0; i < n_quads; i++) {
const unsigned char *inp = in + i * 4;

/* Check for padding - exit fast path to handle it properly.
* Four independent comparisons let the compiler choose the optimal
* approach; on modern pipelined CPUs this is faster than bitmask tricks
* like XOR+SUB+AND for zero-detection, which have data dependencies.
*/
if (inp[0] == BASE64_PAD || inp[1] == BASE64_PAD ||
inp[2] == BASE64_PAD || inp[3] == BASE64_PAD) {
break;
}
Review comment (Member), on lines +182 to +190:

For each group we have two checks: one here, comparing all four bytes to
BASE64_PAD, and another in base64_decode_quad(), (v0 | v1 | v2 | v3) & 0xc0.
Even if the former is much faster than the latter, aren't two checks slower
than just one? For most groups both are false; we can only benefit on the
last group. For large data the benefit is much smaller than the cost, which
is proportional to the size of the data.


if (!base64_decode_quad(inp, out + i * 3, table)) {
break;
}
}

return i * 4;
}
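
One way to address the two-checks concern above, sketched under an
assumption: if table_a2b_base64 maps '=' to an invalid entry (one with the
0xc0 bits set, as the -1 entries are; the '=' slot is not visible in this
hunk), then padding already fails the validity test inside
base64_decode_quad() and the per-group BASE64_PAD comparisons can be
dropped entirely:

static inline Py_ssize_t
base64_decode_fast_onecheck(const unsigned char *in, Py_ssize_t in_len,
                            unsigned char *out, const unsigned char *table)
{
    Py_ssize_t n_quads = in_len / 4;
    Py_ssize_t i;

    for (i = 0; i < n_quads; i++) {
        /* A quad containing '=' (or any byte outside the alphabet)
         * fails the single combined check; the slow path then takes
         * over and handles padding and error reporting properly. */
        if (!base64_decode_quad(in + i * 4, out + i * 3, table)) {
            break;
        }
    }
    return i * 4;
}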


static const unsigned short crctab_hqx[256] = {
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
@@ -403,10 +496,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
goto error_end;
}

size_t i = 0; /* Current position in input */

/* Fast path: use optimized decoder for complete quads.
* This works for both strict and non-strict mode for valid input.
* The fast path stops at padding, invalid chars, or incomplete groups.
*/
if (ascii_len >= 4) {
Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
bin_data, table_a2b_base64);
if (fast_chars > 0) {
Review comment (Member):

Is this condition needed?
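
On the question above: when fast_chars is 0, both statements inside the
guard are no-ops (i is set to 0 and bin_data advances by 0), so the
condition appears redundant, though harmless. This is a reading of the
code as shown, not a confirmed change.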

i = (size_t)fast_chars;
bin_data += (fast_chars / 4) * 3;
}
}

/* Slow path: handle remaining input (padding, invalid chars, partial groups) */
int quad_pos = 0;
unsigned char leftchar = 0;
int pads = 0;
for (size_t i = 0; i < ascii_len; i++) {
for (; i < ascii_len; i++) {
unsigned char this_ch = ascii_data[i];

/* Check for pad sequences and ignore
@@ -533,9 +642,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
/*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/
{
const unsigned char *bin_data;
int leftbits = 0;
unsigned char this_ch;
unsigned int leftchar = 0;
Py_ssize_t bin_len;
binascii_state *state;

@@ -566,26 +672,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
}
unsigned char *ascii_data = PyBytesWriter_GetData(writer);

for( ; bin_len > 0 ; bin_len--, bin_data++ ) {
/* Shift the data into our buffer */
leftchar = (leftchar << 8) | *bin_data;
leftbits += 8;

/* See if there are 6-bit groups ready */
while ( leftbits >= 6 ) {
this_ch = (leftchar >> (leftbits-6)) & 0x3f;
leftbits -= 6;
*ascii_data++ = table_b2a_base64[this_ch];
}
}
if ( leftbits == 2 ) {
*ascii_data++ = table_b2a_base64[(leftchar&3) << 4];
/* Use the optimized fast path for complete 3-byte groups */
Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data,
table_b2a_base64);
bin_data += fast_bytes;
ascii_data += (fast_bytes / 3) * 4;
Review comment (Member):

I wonder, wouldn't it be more efficient to return the number of groups, so
you can avoid the division? Although the difference may be below the noise.
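
A sketch of the group-count variant suggested above
(base64_encode_fast_groups is a hypothetical name, not part of the patch).
As the reviewer notes, the gain is likely below the noise, since compilers
reduce division by a constant to a multiply-and-shift anyway:

static inline Py_ssize_t
base64_encode_fast_groups(const unsigned char *in, Py_ssize_t in_len,
                          unsigned char *out, const unsigned char *table)
{
    Py_ssize_t n_trios = in_len / 3;

    for (Py_ssize_t i = 0; i < n_trios; i++) {
        base64_encode_trio(in + i * 3, out + i * 4, table);
    }
    /* Caller advances without dividing:
     *   bin_data += n * 3; ascii_data += n * 4; bin_len -= n * 3; */
    return n_trios;
}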

bin_len -= fast_bytes;

/* Handle remaining 0-2 bytes */
if (bin_len == 1) {
/* 1 byte remaining: produces 2 base64 chars + 2 padding */
unsigned int val = bin_data[0];
*ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f];
*ascii_data++ = table_b2a_base64[(val << 4) & 0x3f];
*ascii_data++ = BASE64_PAD;
*ascii_data++ = BASE64_PAD;
} else if ( leftbits == 4 ) {
*ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2];
}
else if (bin_len == 2) {
/* 2 bytes remaining: produces 3 base64 chars + 1 padding */
unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1];
*ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f];
*ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f];
*ascii_data++ = table_b2a_base64[(val << 2) & 0x3f];
*ascii_data++ = BASE64_PAD;
}
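
As a concrete trace of this branch: encoding the 2-byte input "hi"
(0x68 0x69) gives val = 0x6869, whose 6-bit fields 26, 6, and 36 select
'a', 'G', and 'k', so the output is "aGk=" after the single pad.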

if (newline)
*ascii_data++ = '\n'; /* Append a courtesy newline */
