Skip to content

Commit ba231cb

Browse files
ebiggers authored and kdave committed
btrfs-progs: crypto: x86/crc32c - eliminate jump table and excessive unrolling
(Linux kernel commit 84dd048cf89557e1e4badd20c9522f7aaa0275fe).

crc32c-pcl-intel-asm_64.S has a loop with 1 to 127 iterations fully unrolled and uses a jump table to jump into the correct location. This optimization is misguided, as it bloats the binary code size and introduces an indirect call. x86_64 CPUs can predict loops well, so it is fine to just use a loop instead. Loop bookkeeping instructions can compete with the crc instructions for the ALUs, but this is easily mitigated by unrolling the loop by a smaller amount, such as 4 times.

Therefore, re-roll the loop and make related tweaks to the code.

This reduces the binary code size of crc_pclmul() from 4546 bytes to 418 bytes, a 91% reduction. In general it also makes the code faster, with some large improvements seen when retpoline is enabled.

More detailed performance results are shown below. They are given as percent improvement in throughput (negative means regressed) for CPU microarchitecture vs. input length in bytes. E.g. an improvement from 40 GB/s to 50 GB/s would be listed as 25%.

Table 1: Results with retpoline enabled (the default):

                       |  512  |  833  | 1024  | 2000  | 3173  | 4096  |
  ---------------------+-------+-------+-------+-------+-------+-------+
  Intel Haswell        | 35.0% | 20.7% | 17.8% |  9.7% | -0.2% |  4.4% |
  Intel Emerald Rapids | 66.8% | 45.2% | 36.3% | 19.3% |  0.0% |  5.4% |
  AMD Zen 2            | 29.5% | 17.2% | 13.5% |  8.6% | -0.5% |  2.8% |

Table 2: Results with retpoline disabled:

                       |  512  |  833  | 1024  | 2000  | 3173  | 4096  |
  ---------------------+-------+-------+-------+-------+-------+-------+
  Intel Haswell        |  3.3% |  4.8% |  4.5% |  0.9% | -2.9% |  0.3% |
  Intel Emerald Rapids |  7.5% |  6.4% |  5.2% |  2.3% | -0.0% |  0.6% |
  AMD Zen 2            | 11.8% |  1.4% |  0.2% |  1.3% | -0.9% | -0.2% |

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent 311c34a commit ba231cb

File tree

1 file changed

+92
-143
lines changed

1 file changed

+92
-143
lines changed

crypto/crc32c-pcl-intel-asm_64.S

Lines changed: 92 additions & 143 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
88
*
99
* Copyright (C) 2012 Intel Corporation.
10+
* Copyright 2024 Google LLC
1011
*
1112
* Authors:
1213
* Wajdi Feghali <wajdi.k.feghali@intel.com>
@@ -44,20 +45,11 @@
4445
*/
4546

4647
##include "linkage.h"
47-
##include <asm/nospec-branch.h>
4848

4949
#define ENDBR
5050

5151
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
5252

53-
.macro LABEL prefix n
54-
.L\prefix\n\():
55-
.endm
56-
57-
.macro JMPTBL_ENTRY i
58-
.quad .Lcrc_\i
59-
.endm
60-
6153
# Define threshold below which buffers are considered "small" and routed to
6254
# regular CRC code that does not interleave the CRC instructions.
6355
#define SMALL_SIZE 200
@@ -69,141 +61,116 @@
6961
.globl crc_pcl
7062
crc_pcl:
7163
###SYM_FUNC_START(crc_pcl)
72-
#define bufp rdi
73-
#define bufp_dw %edi
74-
#define bufp_w %di
75-
#define bufp_b %dil
76-
#define bufptmp %rcx
77-
#define block_0 %rcx
78-
#define block_1 %rdx
79-
#define block_2 %r11
80-
#define len %esi
81-
#define crc_init_arg %edx
82-
#define tmp %rbx
83-
#define crc_init %r8d
84-
#define crc_init_q %r8
85-
#define crc1 %r9
86-
#define crc2 %r10
87-
88-
pushq %rbx
89-
pushq %rdi
90-
pushq %rsi
64+
#define bufp %rdi
65+
#define bufp_d %edi
66+
#define len %esi
67+
#define crc_init %edx
68+
#define crc_init_q %rdx
69+
#define n_misaligned %ecx /* overlaps chunk_bytes! */
70+
#define n_misaligned_q %rcx
71+
#define chunk_bytes %ecx /* overlaps n_misaligned! */
72+
#define chunk_bytes_q %rcx
73+
#define crc1 %r8
74+
#define crc2 %r9
9175

92-
## Move crc_init for Linux to a different
93-
mov crc_init_arg, crc_init
94-
95-
mov %bufp, bufptmp # rdi = *buf
9676
cmp $SMALL_SIZE, len
9777
jb .Lsmall
9878

9979
################################################################
10080
## 1) ALIGN:
10181
################################################################
102-
neg %bufp
103-
and $7, %bufp # calculate the unalignment amount of
82+
mov bufp_d, n_misaligned
83+
neg n_misaligned
84+
and $7, n_misaligned # calculate the misalignment amount of
10485
# the address
105-
je .Lproc_block # Skip if aligned
86+
je .Laligned # Skip if aligned
10687

88+
# Process 1 <= n_misaligned <= 7 bytes individually in order to align
89+
# the remaining data to an 8-byte boundary.
10790
.Ldo_align:
108-
#### Calculate CRC of unaligned bytes of the buffer (if any)
109-
movq (bufptmp), tmp # load a quadward from the buffer
110-
add %bufp, bufptmp # align buffer pointer for quadword
111-
# processing
112-
sub bufp_dw, len # update buffer length
91+
movq (bufp), %rax
92+
add n_misaligned_q, bufp
93+
sub n_misaligned, len
11394
.Lalign_loop:
114-
crc32b %bl, crc_init # compute crc32 of 1-byte
115-
shr $8, tmp # get next byte
116-
dec %bufp
95+
crc32b %al, crc_init # compute crc32 of 1-byte
96+
shr $8, %rax # get next byte
97+
dec n_misaligned
11798
jne .Lalign_loop
118-
119-
.Lproc_block:
99+
.Laligned:
120100

121101
################################################################
122-
## 2) PROCESS BLOCKS:
102+
## 2) PROCESS BLOCK:
123103
################################################################
124104

125-
## compute num of bytes to be processed
126-
127105
cmp $128*24, len
128106
jae .Lfull_block
129107

130-
.Lcontinue_block:
131-
## len < 128*24
132-
movq $2731, %rax # 2731 = ceil(2^16 / 24)
133-
mul len
134-
shrq $16, %rax
135-
136-
## eax contains floor(bytes / 24) = num 24-byte chunks to do
137-
138-
## process rax 24-byte chunks (128 >= rax >= 0)
139-
140-
## compute end address of each block
141-
## block 0 (base addr + RAX * 8)
142-
## block 1 (base addr + RAX * 16)
143-
## block 2 (base addr + RAX * 24)
144-
lea (bufptmp, %rax, 8), block_0
145-
lea (block_0, %rax, 8), block_1
146-
lea (block_1, %rax, 8), block_2
147-
148-
xor crc1, crc1
149-
xor crc2, crc2
150-
151-
## branch into array
152-
leaq jump_table(%rip), %bufp
153-
mov (%bufp,%rax,8), %bufp
154-
## JMP_NOSPEC
155-
JMP *%bufp
156-
## JMP_NOSPEC
108+
.Lpartial_block:
109+
# Compute floor(len / 24) to get num qwords to process from each lane.
110+
imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
111+
shr $16, %eax
112+
jmp .Lcrc_3lanes
157113

158-
################################################################
159-
## 2a) PROCESS FULL BLOCKS:
160-
################################################################
161114
.Lfull_block:
162-
movl $128,%eax
163-
lea 128*8*2(block_0), block_1
164-
lea 128*8*3(block_0), block_2
165-
add $128*8*1, block_0
166-
167-
xor crc1,crc1
168-
xor crc2,crc2
169-
170-
# Fall through into top of crc array (crc_128)
115+
# Processing 128 qwords from each lane.
116+
mov $128, %eax
171117

172118
################################################################
173-
## 3) CRC Array:
119+
## 3) CRC each of three lanes:
174120
################################################################
175121

176-
i=128
177-
.rept 128-1
178-
.altmacro
179-
LABEL crc_ %i
180-
.noaltmacro
181-
ENDBR
182-
crc32q -i*8(block_0), crc_init_q
183-
crc32q -i*8(block_1), crc1
184-
crc32q -i*8(block_2), crc2
185-
i=(i-1)
186-
.endr
187-
188-
.altmacro
189-
LABEL crc_ %i
190-
.noaltmacro
191-
ENDBR
192-
crc32q -i*8(block_0), crc_init_q
193-
crc32q -i*8(block_1), crc1
194-
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
122+
.Lcrc_3lanes:
123+
xor crc1,crc1
124+
xor crc2,crc2
125+
mov %eax, chunk_bytes
126+
shl $3, chunk_bytes # num bytes to process from each lane
127+
sub $5, %eax # 4 for 4x_loop, 1 for special last iter
128+
jl .Lcrc_3lanes_4x_done
129+
130+
# Unroll the loop by a factor of 4 to reduce the overhead of the loop
131+
# bookkeeping instructions, which can compete with crc32q for the ALUs.
132+
.Lcrc_3lanes_4x_loop:
133+
crc32q (bufp), crc_init_q
134+
crc32q (bufp,chunk_bytes_q), crc1
135+
crc32q (bufp,chunk_bytes_q,2), crc2
136+
crc32q 8(bufp), crc_init_q
137+
crc32q 8(bufp,chunk_bytes_q), crc1
138+
crc32q 8(bufp,chunk_bytes_q,2), crc2
139+
crc32q 16(bufp), crc_init_q
140+
crc32q 16(bufp,chunk_bytes_q), crc1
141+
crc32q 16(bufp,chunk_bytes_q,2), crc2
142+
crc32q 24(bufp), crc_init_q
143+
crc32q 24(bufp,chunk_bytes_q), crc1
144+
crc32q 24(bufp,chunk_bytes_q,2), crc2
145+
add $32, bufp
146+
sub $4, %eax
147+
jge .Lcrc_3lanes_4x_loop
148+
149+
.Lcrc_3lanes_4x_done:
150+
add $4, %eax
151+
jz .Lcrc_3lanes_last_qword
152+
153+
.Lcrc_3lanes_1x_loop:
154+
crc32q (bufp), crc_init_q
155+
crc32q (bufp,chunk_bytes_q), crc1
156+
crc32q (bufp,chunk_bytes_q,2), crc2
157+
add $8, bufp
158+
dec %eax
159+
jnz .Lcrc_3lanes_1x_loop
195160

196-
mov block_2, block_0
161+
.Lcrc_3lanes_last_qword:
162+
crc32q (bufp), crc_init_q
163+
crc32q (bufp,chunk_bytes_q), crc1
164+
# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
197165

198166
################################################################
199167
## 4) Combine three results:
200168
################################################################
201169

202-
lea (K_table-8)(%rip), %bufp # first entry is for idx 1
203-
shlq $3, %rax # rax *= 8
204-
pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
205-
leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
206-
sub %eax, len # len -= rax*24
170+
lea (K_table-8)(%rip), %rax # first entry is for idx 1
171+
pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
172+
lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
173+
sub %eax, len # len -= chunk_bytes * 3
207174

208175
movq crc_init_q, %xmm1 # CRC for block 1
209176
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
@@ -213,20 +180,19 @@ LABEL crc_ %i
213180

214181
pxor %xmm2,%xmm1
215182
movq %xmm1, %rax
216-
xor -i*8(block_2), %rax
183+
xor (bufp,chunk_bytes_q,2), %rax
217184
mov crc2, crc_init_q
218185
crc32 %rax, crc_init_q
186+
lea 8(bufp,chunk_bytes_q,2), bufp
219187

220188
################################################################
221-
## 5) Check for end:
189+
## 5) If more blocks remain, goto (2):
222190
################################################################
223191

224-
LABEL crc_ 0
225-
ENDBR
226192
cmp $128*24, len
227-
jae .Lfull_block
193+
jae .Lfull_block
228194
cmp $SMALL_SIZE, len
229-
jae .Lcontinue_block
195+
jae .Lpartial_block
230196

231197
#######################################################################
232198
## 6) Process any remainder without interleaving:
@@ -238,49 +204,32 @@ LABEL crc_ 0
238204
shr $3, %eax
239205
jz .Ldo_dword
240206
.Ldo_qwords:
241-
crc32q (bufptmp), crc_init_q
242-
add $8, bufptmp
207+
crc32q (bufp), crc_init_q
208+
add $8, bufp
243209
dec %eax
244210
jnz .Ldo_qwords
245211
.Ldo_dword:
246212
test $4, len
247213
jz .Ldo_word
248-
crc32l (bufptmp), crc_init
249-
add $4, bufptmp
214+
crc32l (bufp), crc_init
215+
add $4, bufp
250216
.Ldo_word:
251217
test $2, len
252218
jz .Ldo_byte
253-
crc32w (bufptmp), crc_init
254-
add $2, bufptmp
219+
crc32w (bufp), crc_init
220+
add $2, bufp
255221
.Ldo_byte:
256222
test $1, len
257223
jz .Ldone
258-
crc32b (bufptmp), crc_init
224+
crc32b (bufp), crc_init
259225
.Ldone:
260226
mov crc_init, %eax
261-
popq %rsi
262-
popq %rdi
263-
popq %rbx
264227
RET
265228
###SYM_FUNC_END(crc_pcl)
266229
.size crc_pcl, .-crc_pcl
267230
###SYM_FUNC_END(crc_pcl)
268231

269-
################################################################
270-
## jump table Table is 129 entries x 2 bytes each
271-
################################################################
272232
.data
273-
.align 4
274-
jump_table:
275-
i=0
276-
.rept 129
277-
.altmacro
278-
JMPTBL_ENTRY %i
279-
.noaltmacro
280-
i=i+1
281-
.endr
282-
283-
284233
################################################################
285234
## PCLMULQDQ tables
286235
## Table is 128 entries x 2 words (8 bytes) each

0 commit comments

Comments
 (0)