77 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
88 *
99 * Copyright (C) 2012 Intel Corporation.
10+ * Copyright 2024 Google LLC
1011 *
1112 * Authors:
1213 * Wajdi Feghali <wajdi.k.feghali@intel.com>
4445 */
4546
4647##include "linkage.h"
47- ##include <asm/nospec-branch.h>
4848
4949#define ENDBR
5050
5151## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
5252
53- .macro LABEL prefix n
54- .L\prefix\n\():
55- .endm
56-
57- .macro JMPTBL_ENTRY i
58- .quad .Lcrc_\i
59- .endm
60-
6153# Define threshold below which buffers are considered "small" and routed to
6254# regular CRC code that does not interleave the CRC instructions.
6355#define SMALL_SIZE 200
6961.globl crc_pcl
7062crc_pcl:
7163###SYM_FUNC_START(crc_pcl)
72- #define bufp rdi
73- #define bufp_dw %edi
74- #define bufp_w %di
75- #define bufp_b %dil
76- #define bufptmp %rcx
77- #define block_0 %rcx
78- #define block_1 %rdx
79- #define block_2 %r11
80- #define len %esi
81- #define crc_init_arg %edx
82- #define tmp %rbx
83- #define crc_init %r8d
84- #define crc_init_q %r8
85- #define crc1 %r9
86- #define crc2 %r10
87-
88- pushq %rbx
89- pushq %rdi
90- pushq %rsi
64+ #define bufp %rdi
65+ #define bufp_d %edi
66+ #define len %esi
67+ #define crc_init %edx
68+ #define crc_init_q %rdx
69+ #define n_misaligned %ecx /* overlaps chunk_bytes! */
70+ #define n_misaligned_q %rcx
71+ #define chunk_bytes %ecx /* overlaps n_misaligned! */
72+ #define chunk_bytes_q %rcx
73+ #define crc1 %r8
74+ #define crc2 %r9
9175
92- ## Move crc_init for Linux to a different register
93- mov crc_init_arg, crc_init
94-
95- mov %bufp, bufptmp # rdi = *buf
9676 cmp $SMALL_SIZE, len
9777 jb .Lsmall
9878
9979 ################################################################
10080 ## 1) ALIGN:
10181 ################################################################
102- neg %bufp
103- and $7 , %bufp # calculate the unalignment amount of
82+ mov bufp_d, n_misaligned
83+ neg n_misaligned
84+ and $7 , n_misaligned # calculate the misalignment amount of
10485 # the address
105- je .Lproc_block # Skip if aligned
86+ je .Laligned # Skip if aligned
10687
88+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
89+ # the remaining data to an 8-byte boundary.
10790.Ldo_align:
108- #### Calculate CRC of unaligned bytes of the buffer (if any)
109- movq (bufptmp), tmp # load a quadword from the buffer
110- add %bufp, bufptmp # align buffer pointer for quadword
111- # processing
112- sub bufp_dw, len # update buffer length
91+ movq (bufp), %rax
92+ add n_misaligned_q, bufp
93+ sub n_misaligned, len
11394.Lalign_loop:
114- crc32b %bl , crc_init # compute crc32 of 1-byte
115- shr $8 , tmp # get next byte
116- dec %bufp
95+ crc32b %al , crc_init # compute crc32 of 1-byte
96+ shr $8 , %rax # get next byte
97+ dec n_misaligned
11798 jne .Lalign_loop
118-
119- .Lproc_block:
99+ .Laligned:
120100
121101 ################################################################
122- ## 2) PROCESS BLOCKS :
102+ ## 2) PROCESS BLOCK :
123103 ################################################################
124104
125- ## compute num of bytes to be processed
126-
127105 cmp $128*24 , len
128106 jae .Lfull_block
129107
130- .Lcontinue_block:
131- ## len < 128*24
132- movq $2731 , %rax # 2731 = ceil(2^16 / 24)
133- mul len
134- shrq $16 , %rax
135-
136- ## eax contains floor(bytes / 24) = num 24-byte chunks to do
137-
138- ## process rax 24-byte chunks (128 >= rax >= 0)
139-
140- ## compute end address of each block
141- ## block 0 (base addr + RAX * 8)
142- ## block 1 (base addr + RAX * 16)
143- ## block 2 (base addr + RAX * 24)
144- lea (bufptmp, %rax , 8 ), block_0
145- lea (block_0, %rax , 8 ), block_1
146- lea (block_1, %rax , 8 ), block_2
147-
148- xor crc1, crc1
149- xor crc2, crc2
150-
151- ## branch into array
152- leaq jump_table(%rip ), %bufp
153- mov (%bufp,%rax ,8 ), %bufp
154- ## JMP_NOSPEC
155- JMP *%bufp
156- ## JMP_NOSPEC
108+ .Lpartial_block:
109+ # Compute floor(len / 24) to get num qwords to process from each lane.
110+ imul $2731 , len, %eax # 2731 = ceil(2^16 / 24)
111+ shr $16 , %eax
112+ jmp .Lcrc_3lanes
157113
158- ################################################################
159- ## 2a) PROCESS FULL BLOCKS:
160- ################################################################
161114.Lfull_block:
162- movl $128 ,%eax
163- lea 128*8*2 (block_0), block_1
164- lea 128*8*3 (block_0), block_2
165- add $128*8*1 , block_0
166-
167- xor crc1,crc1
168- xor crc2,crc2
169-
170- # Fall through into top of crc array (crc_128)
115+ # Processing 128 qwords from each lane.
116+ mov $128 , %eax
171117
172118 ################################################################
173- ## 3) CRC Array :
119+ ## 3) CRC each of three lanes :
174120 ################################################################
175121
176- i =128
177- .rept 128 -1
178- .altmacro
179- LABEL crc_ %i
180- .noaltmacro
181- ENDBR
182- crc32q -i*8 (block_0), crc_init_q
183- crc32q -i*8 (block_1), crc1
184- crc32q -i*8 (block_2), crc2
185- i = (i-1 )
186- .endr
187-
188- .altmacro
189- LABEL crc_ %i
190- .noaltmacro
191- ENDBR
192- crc32q -i*8 (block_0), crc_init_q
193- crc32q -i*8 (block_1), crc1
194- # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
122+ .Lcrc_3lanes:
123+ xor crc1,crc1
124+ xor crc2,crc2
125+ mov %eax , chunk_bytes
126+ shl $3 , chunk_bytes # num bytes to process from each lane
127+ sub $5 , %eax # 4 for 4x_loop, 1 for special last iter
128+ jl .Lcrc_3lanes_4x_done
129+
130+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
131+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
132+ .Lcrc_3lanes_4x_loop:
133+ crc32q (bufp), crc_init_q
134+ crc32q (bufp,chunk_bytes_q), crc1
135+ crc32q (bufp,chunk_bytes_q,2 ), crc2
136+ crc32q 8 (bufp), crc_init_q
137+ crc32q 8 (bufp,chunk_bytes_q), crc1
138+ crc32q 8 (bufp,chunk_bytes_q,2 ), crc2
139+ crc32q 16 (bufp), crc_init_q
140+ crc32q 16 (bufp,chunk_bytes_q), crc1
141+ crc32q 16 (bufp,chunk_bytes_q,2 ), crc2
142+ crc32q 24 (bufp), crc_init_q
143+ crc32q 24 (bufp,chunk_bytes_q), crc1
144+ crc32q 24 (bufp,chunk_bytes_q,2 ), crc2
145+ add $32 , bufp
146+ sub $4 , %eax
147+ jge .Lcrc_3lanes_4x_loop
148+
149+ .Lcrc_3lanes_4x_done:
150+ add $4 , %eax
151+ jz .Lcrc_3lanes_last_qword
152+
153+ .Lcrc_3lanes_1x_loop:
154+ crc32q (bufp), crc_init_q
155+ crc32q (bufp,chunk_bytes_q), crc1
156+ crc32q (bufp,chunk_bytes_q,2 ), crc2
157+ add $8 , bufp
158+ dec %eax
159+ jnz .Lcrc_3lanes_1x_loop
195160
196- mov block_2, block_0
161+ .Lcrc_3lanes_last_qword:
162+ crc32q (bufp), crc_init_q
163+ crc32q (bufp,chunk_bytes_q), crc1
164+ # SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
197165
198166 ################################################################
199167 ## 4) Combine three results:
200168 ################################################################
201169
202- lea (K_table-8 )(%rip ), %bufp # first entry is for idx 1
203- shlq $3 , %rax # rax *= 8
204- pmovzxdq (%bufp,%rax ), %xmm0 # 2 consts: K1:K2
205- leal (%eax ,%eax ,2 ), %eax # rax *= 3 (total *24)
206- sub %eax , len # len -= rax*24
170+ lea (K_table-8 )(%rip ), %rax # first entry is for idx 1
171+ pmovzxdq (%rax ,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
172+ lea (chunk_bytes,chunk_bytes,2 ), %eax # chunk_bytes * 3
173+ sub %eax , len # len -= chunk_bytes * 3
207174
208175 movq crc_init_q, %xmm1 # CRC for block 1
209176 pclmulqdq $0x00 , %xmm0 , %xmm1 # Multiply by K2
@@ -213,20 +180,19 @@ LABEL crc_ %i
213180
214181 pxor %xmm2 ,%xmm1
215182 movq %xmm1 , %rax
216- xor -i* 8 (block_2 ), %rax
183+ xor (bufp,chunk_bytes_q, 2 ), %rax
217184 mov crc2, crc_init_q
218185 crc32 %rax , crc_init_q
186+ lea 8 (bufp,chunk_bytes_q,2 ), bufp
219187
220188 ################################################################
221- ## 5) Check for end :
189+ ## 5) If more blocks remain, goto (2) :
222190 ################################################################
223191
224- LABEL crc_ 0
225- ENDBR
226192 cmp $128*24 , len
227- jae .Lfull_block
193+ jae .Lfull_block
228194 cmp $SMALL_SIZE, len
229- jae .Lcontinue_block
195+ jae .Lpartial_block
230196
231197 #######################################################################
232198 ## 6) Process any remainder without interleaving:
@@ -238,49 +204,32 @@ LABEL crc_ 0
238204 shr $3 , %eax
239205 jz .Ldo_dword
240206.Ldo_qwords:
241- crc32q (bufptmp ), crc_init_q
242- add $8 , bufptmp
207+ crc32q (bufp ), crc_init_q
208+ add $8 , bufp
243209 dec %eax
244210 jnz .Ldo_qwords
245211.Ldo_dword:
246212 test $4 , len
247213 jz .Ldo_word
248- crc32l (bufptmp ), crc_init
249- add $4 , bufptmp
214+ crc32l (bufp ), crc_init
215+ add $4 , bufp
250216.Ldo_word:
251217 test $2 , len
252218 jz .Ldo_byte
253- crc32w (bufptmp ), crc_init
254- add $2 , bufptmp
219+ crc32w (bufp ), crc_init
220+ add $2 , bufp
255221.Ldo_byte:
256222 test $1 , len
257223 jz .Ldone
258- crc32b (bufptmp ), crc_init
224+ crc32b (bufp ), crc_init
259225.Ldone:
260226 mov crc_init, %eax
261- popq %rsi
262- popq %rdi
263- popq %rbx
264227 RET
265228###SYM_FUNC_END(crc_pcl)
266229.size crc_pcl, .-crc_pcl
267230###SYM_FUNC_END(crc_pcl)
268231
269- ################################################################
270- ## Jump table: 129 entries x 8 bytes each (one .quad per entry)
271- ################################################################
272232.data
273- .align 4
274- jump_table:
275- i =0
276- .rept 129
277- .altmacro
278- JMPTBL_ENTRY %i
279- .noaltmacro
280- i = i+1
281- .endr
282-
283-
284233 ################################################################
285234 ## PCLMULQDQ tables
286235 ## Table is 128 entries x 2 dwords (8 bytes) each
0 commit comments