8888 "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
8989 #nn"00:\n\t"
9090
91+ /* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
9192#define INNER_KERNELm8 (nn ) \
92- "cmpq $8 ,%2;jb "#nn"001f;"\
93+ "movq %3,%10; cmpq $16 ,%2;jb "#nn"001f;"\
9394 #nn"008:\n\t"\
9495 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
9596 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
97+ "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
9698 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
9799 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
98- INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
99- INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
100- INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
101- INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
102- "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\
100+ "prefetcht1 (%11); addq $16,%11;"\
101+ "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
102+ "movq %3,%10;"\
103103 #nn"001:\n\t"\
104104 "cmpq $1,%2;jb "#nn"000f;"\
105+ "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
105106 INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
106107 "decq %2;jmp "#nn"001b;"\
107108 ""#nn"000:\n\t"
 
 #define INNER_STORE_m1n8(c1,disp) \
     "kxnorw %%k1,%%k1,%%k1;"\
-    "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
+    "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\
     "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
     "kxnorw %%k1,%%k1,%%k1;"\
-    "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
+    "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};"
 
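INNER_STORE_m1n8 updates a single strip of C whose eight elements sit ldc bytes apart: it gathers them through the byte-offset index vector in zmm6 (built by INNER_SETINDEX), computes alpha*acc + C with vfmadd132pd, and scatters the result back. A hedged intrinsics analogue is below; the function name and the explicit index construction are illustrative, not the kernel's code.

    /* Sketch of the gather -> fma -> scatter update done by INNER_STORE_m1n8. */
    #include <immintrin.h>

    static void store_m1n8_like(double *c, long long ldc_in_bytes,
                                __m512d acc, __m512d valpha)
    {
        /* byte offsets 0, ldc, 2*ldc, ..., 7*ldc -- the role of zmm6 */
        __m512i idx = _mm512_setr_epi64(0, ldc_in_bytes, 2*ldc_in_bytes,
                                        3*ldc_in_bytes, 4*ldc_in_bytes,
                                        5*ldc_in_bytes, 6*ldc_in_bytes,
                                        7*ldc_in_bytes);
        __m512d old = _mm512_i64gather_pd(idx, c, 1);     /* vgatherqpd, scale 1 */
        __m512d res = _mm512_fmadd_pd(acc, valpha, old);  /* alpha*acc + C       */
        _mm512_i64scatter_pd(c, idx, res, 1);             /* vscatterqpd         */
    }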
 #define INNER_SAVE_m1n8 \
+    "movq %3,%10;"\
     INNER_SETINDEX\
     INNER_STORE_m1n8(%%zmm8,0)
 
 #define INNER_SAVE_m1n16 \
     INNER_SAVE_m1n8\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm9,0)
 
 #define INNER_SAVE_m1n24 \
     INNER_SAVE_m1n16\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm10,0)
 
 #define INNER_SAVE_m2n8 \
+    "movq %3,%10;"\
     INNER_SETINDEX\
     INNER_STORE_m1n8(%%zmm8,0)\
     INNER_STORE_m1n8(%%zmm9,8)
 
 #define INNER_SAVE_m2n16 \
+    "movq %3,%10;"\
     INNER_SETINDEX\
     INNER_STORE_m1n8(%%zmm8,0)\
     INNER_STORE_m1n8(%%zmm10,8)\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm9,0)\
     INNER_STORE_m1n8(%%zmm11,8)
+
 #define INNER_SAVE_m2n24 \
+    "movq %3,%10;"\
     INNER_SETINDEX\
     INNER_STORE_m1n8(%%zmm8,0)\
     INNER_STORE_m1n8(%%zmm11,8)\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm9,0)\
     INNER_STORE_m1n8(%%zmm12,8)\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm10,0)\
     INNER_STORE_m1n8(%%zmm13,8)
-#define INNER_PREF_8x8 \
-    "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\
-    "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\
-    "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\
-    "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\
-    "subq %4,%3; subq %4,%3; subq %4,%3;"
+
 #define INNER_TRANS_4x8(c1,c2,c3,c4) \
     "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
     "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
     "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
     "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
     "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
     "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+
 #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
     INNER_TRANS_4x8(c1,c2,c3,c4)\
     INNER_TRANS_4x8(c5,c6,c7,c8)\
     "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
     "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
     "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
+
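The INNER_TRANS macros reshuffle the accumulator tile in registers (vunpcklpd/vunpckhpd followed by masked vblendmpd and vshuff64x2) so that each zmm ends up holding one row of C that INNER_STORE can write contiguously. The snippet below shows the same unpack-then-recombine idea on a smaller 4x4 block of doubles with 256-bit AVX; it is an analogue for illustration, not the kernel's zmm sequence.

    /* 4x4 double transpose via unpack + 128-bit lane shuffles (illustrative). */
    #include <immintrin.h>

    static void transpose_4x4(double *dst, const double *src)
    {
        __m256d r0 = _mm256_loadu_pd(src + 0), r1 = _mm256_loadu_pd(src + 4);
        __m256d r2 = _mm256_loadu_pd(src + 8), r3 = _mm256_loadu_pd(src + 12);
        __m256d t0 = _mm256_unpacklo_pd(r0, r1);  /* a00 a10 a02 a12 */
        __m256d t1 = _mm256_unpackhi_pd(r0, r1);  /* a01 a11 a03 a13 */
        __m256d t2 = _mm256_unpacklo_pd(r2, r3);
        __m256d t3 = _mm256_unpackhi_pd(r2, r3);
        _mm256_storeu_pd(dst + 0,  _mm256_permute2f128_pd(t0, t2, 0x20)); /* col 0 */
        _mm256_storeu_pd(dst + 4,  _mm256_permute2f128_pd(t1, t3, 0x20)); /* col 1 */
        _mm256_storeu_pd(dst + 8,  _mm256_permute2f128_pd(t0, t2, 0x31)); /* col 2 */
        _mm256_storeu_pd(dst + 12, _mm256_permute2f128_pd(t1, t3, 0x31)); /* col 3 */
    }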
 //%7 for k01(input) only when m=4
 #define INNER_STORE_4x8(c1,c2,c3,c4) \
-    "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
-    "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
-    "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
-    "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
-    "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "leaq (%3,%4,4),%3;"
+    "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
+    "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+    "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
+    "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+    "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
+    "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+    "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
+    "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+    "leaq (%10,%4,4),%10;"
+
 #define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
-    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
-    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
-    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
-    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
+    "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\
+    "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\
+    "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\
+    "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;"
+
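INNER_STORE_8x8 performs the C update two rows at a time: vfmadd213pd with a memory operand computes C = alpha*acc + C in place (zmm3 holds the broadcast alpha), the result is written back with vmovupd, and the row pointer in %10 advances by 2*ldc. A one-row intrinsics sketch of that update follows; the function and parameter names are illustrative.

    /* One row of the C update: c_row[0..7] = alpha * acc[0..7] + c_row[0..7]. */
    #include <immintrin.h>

    static void update_c_row(double *c_row, __m512d acc, double alpha)
    {
        __m512d valpha = _mm512_set1_pd(alpha);   /* "vbroadcastsd (%9),%%zmm3" */
        __m512d old    = _mm512_loadu_pd(c_row);
        _mm512_storeu_pd(c_row, _mm512_fmadd_pd(acc, valpha, old));
    }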
 #define INNER_SAVE_m4n8 \
+    "movq %3,%10;"\
     INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
     INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
+
 #define INNER_SAVE_m4n16 \
+    "movq %3,%10;"\
     INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
     INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
     INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
     INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+
 #define INNER_SAVE_m4n24 \
+    "movq %3,%10;"\
     INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
     INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
     INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
     INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
     INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
     INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+
 #define INNER_SAVE_m8n8 \
-    INNER_PREF_8x8 \
+    "movq %3,%10;"\
     INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
     INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+
 #define INNER_SAVE_m8n16 \
-    INNER_PREF_8x8 \
+    "movq %3,%10;"\
     INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
     INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-    INNER_PREF_8x8\
     INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
     INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+
 #define INNER_SAVE_m8n24 \
-    INNER_PREF_8x8 \
+    "movq %3,%10;"\
     INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
     INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-    INNER_PREF_8x8\
     INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
     INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-    INNER_PREF_8x8\
     INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
     INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
 
 #define COMPUTE_n8 {\
+    b_pref = packed_b_pointer + 8 * K;\
     __asm__ __volatile__(\
     "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
     INNER_KERNELm8(8)\
     INNER_SAVE_m8n8\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-    "shlq $3,%4;subq %4,%3;shrq $3,%4; addq $64,%3;"\
+    "addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 42221b;"\
     "42222:\n\t"\
     "cmpq $4,%8; jb 42223f;"\
     INNER_INIT_m4n8\
     INNER_KERNELm4(8)\
     INNER_SAVE_m4n8\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $3,%4;subq %4,%3;shrq $3,%4; addq $32,%3;"\
+    "addq $32,%3;"\
     "subq $4,%8;"\
     "42223:\n\t"\
     "cmpq $2,%8; jb 42224f;"\
     "42225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+     "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
     ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
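COMPUTE_n8 walks the m dimension with an 8-wide main loop followed by 4/2/1 tails (the cmpq/jb ladder; the m=2 and m=1 blocks fall in lines collapsed out of this hunk), and with this patch the C pointer in %3 simply advances by each block's width in bytes, since per-row stepping now goes through the %10 copy. A rough C-level sketch of that control flow, for orientation only; the function and variable names are not from the source.

    /* Control-flow sketch of the m-blocking in COMPUTE_n8 (not real code). */
    static double *compute_n8_shape(double *c, long m)
    {
        while (m >= 8) { /* m8n8: init, INNER_KERNELm8(8), INNER_SAVE_m8n8 */
            c += 8; m -= 8;                    /* "addq $64,%3": 8 doubles */
        }
        if (m >= 4) { /* m4n8 block */ c += 4; m -= 4; }   /* "addq $32,%3" */
        if (m >= 2) { /* m2n8 block */ c += 2; m -= 2; }   /* "addq $16,%3" */
        if (m >= 1) { /* m1n8 block */ c += 1; }           /* "addq $8,%3"  */
        return c;
    }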
 #define COMPUTE_n16 {\
+    b_pref = packed_b_pointer + 16 * K;\
     __asm__ __volatile__(\
     "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
     INNER_KERNELm8(16)\
     INNER_SAVE_m8n16\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-    "shlq $4,%4;subq %4,%3;shrq $4,%4; addq $64,%3;"\
+    "addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 32221b;"\
     "32222:\n\t"\
     "cmpq $4,%8; jb 32223f;"\
     INNER_INIT_m4n16\
     INNER_KERNELm4(16)\
     INNER_SAVE_m4n16\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $4,%4;subq %4,%3;shrq $4,%4; addq $32,%3;"\
+    "addq $32,%3;"\
     "subq $4,%8;"\
     "32223:\n\t"\
     "cmpq $2,%8; jb 32224f;"\
     INNER_INIT_m2n16\
     INNER_KERNELm2(16)\
     INNER_SAVE_m2n16\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $3,%4;subq %4,%3;shrq $3,%4; addq $16,%3;"\
+    "addq $16,%3;"\
     "subq $2,%8;"\
     "32224:\n\t"\
     "cmpq $1,%8; jb 32225f;"\
     INNER_INIT_m1n16\
     INNER_KERNELm1(16)\
     INNER_SAVE_m1n16\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $3,%4;subq %4,%3;shrq $3,%4; addq $8,%3;"\
+    "addq $8,%3;"\
     "32225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
     "leaq (%1,%%r12,4),%1;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+     "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
     ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
     "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n24 {\
+    b_pref = packed_b_pointer + 24 * K;\
     __asm__ __volatile__(\
     "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
     INNER_KERNELm8(24)\
     INNER_SAVE_m8n24\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-    "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4; addq $64,%3;"\
+    "addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 22221b;"\
     "22222:\n\t"\
     "cmpq $4,%8; jb 22223f;"\
     INNER_INIT_m4n24\
     INNER_KERNELm4(24)\
     INNER_SAVE_m4n24\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4; addq $32,%3;"\
+    "addq $32,%3;"\
     "subq $4,%8;"\
     "22223:\n\t"\
     "cmpq $2,%8; jb 22224f;"\
     INNER_INIT_m2n24\
     INNER_KERNELm2(24)\
     INNER_SAVE_m2n24\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $4,%4;subq %4,%3;shrq $4,%4; addq $16,%3;"\
+    "addq $16,%3;"\
     "subq $2,%8;"\
     "22224:\n\t"\
     "cmpq $1,%8; jb 22225f;"\
     INNER_INIT_m1n24\
     INNER_KERNELm1(24)\
     INNER_SAVE_m1n24\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $4,%4;subq %4,%3;shrq $4,%4; addq $8,%3;"\
+    "addq $8,%3;"\
     "22225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
     "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+     "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
     ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
     "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
@@ -415,8 +427,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
   if (k == 0 || m == 0 || ndiv8 == 0) return;
   int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
   int64_t K = (int64_t)k; int64_t M = (int64_t)m;
-  double *a_block_pointer;
-  double *c_pointer = c;
+  double *a_block_pointer, *b_pref;
+  double *c_pointer = c, *c_store = c;
   __mmask16 k01 = 0x00f0, k02 = 0x000f, k03 = 0x0033;
   BLASLONG ndiv8_count;
   double *packed_b_pointer = packed_b;
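The three mask constants declared above are passed into the asm as %5 = k02 (0x000f), %6 = k03 (0x0033) and %7 = k01 (0x00f0); on 8-double zmm operands they select the low four lanes, lanes {0,1,4,5}, and the high four lanes respectively, which is how the m=4 store path writes one half of a register to (%10) and the other half through the -32(%10,%4,4) address. A hedged intrinsics illustration of splitting one zmm across two destinations with those masks (the helper and its names are illustrative, not part of the kernel):

    /* Masked split of one zmm: lanes 0..3 to lo_dst, lanes 4..7 to hi_dst. */
    #include <immintrin.h>

    static void masked_split_store(double *lo_dst, double *hi_dst, __m512d v)
    {
        _mm512_mask_storeu_pd(lo_dst,     (__mmask8)0x0f, v);  /* like %{%5%} */
        /* base shifted back 4 doubles so lanes 4..7 land at hi_dst,
           mirroring the -32(%10,%4,4) addressing used with mask %{%7%} */
        _mm512_mask_storeu_pd(hi_dst - 4, (__mmask8)0xf0, v);
    }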