|
2 | 2 | #include <stdint.h> |
3 | 3 | #include <immintrin.h> |
4 | 4 |
|
5 | | -//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. |
| 5 | +//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. |
| 6 | + |
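/* Editor's note -- a scalar model of the macro family below, for reference.
   INNER_KERNEL_k1m{M}n{N} performs one k-iteration of an MxN tile update,
   i.e. a rank-1 update of the zmm accumulators. The operand roles (%0 =
   packed A, %1 = packed B) are read off the macros; that %%r12 holds 32*K
   bytes (the stride between packed 4-row A panels, and half the stride
   between packed 8-column B panels) is an assumption, not code from this
   file. */
static inline void k1_step_model(const double *a, const double *b,
                                 double *c_tile, int m, int n)
{
    for (int i = 0; i < m; i++)      /* one broadcast A element per row */
        for (int j = 0; j < n; j++)  /* FMA into the row-major C tile   */
            c_tile[(long)i * n + j] += a[i] * b[j];
}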
6 | 7 | /* row-major c_block */ |
7 | 8 | #define INNER_KERNEL_k1m1n8 \ |
8 | 9 | "prefetcht0 384(%1);"\ |
|
13 | 14 | INNER_KERNEL_k1m1n8\ |
14 | 15 | "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" |
15 | 16 |
|
16 | | -#define INNER_KERNEL_k1m4n8 \ |
17 | | - INNER_KERNEL_k1m2n8\ |
18 | | - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\ |
19 | | - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;" |
20 | | - |
21 | | -#define INNER_KERNEL_k1m8n8 \ |
22 | | - INNER_KERNEL_k1m4n8\ |
23 | | - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\ |
24 | | - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\ |
25 | | - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\ |
26 | | - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" |
27 | | - |
28 | 17 | #define INNER_KERNEL_k1m1n16 \ |
29 | 18 | "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ |
30 | 19 | "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ |
|
34 | 23 | INNER_KERNEL_k1m1n16\ |
35 | 24 | "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" |
36 | 25 |
|
37 | | -#define INNER_KERNEL_k1m4n16 \ |
38 | | - INNER_KERNEL_k1m2n16\ |
39 | | - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\ |
40 | | - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;" |
41 | | - |
42 | | -#define INNER_KERNEL_k1m8n16 \ |
43 | | - INNER_KERNEL_k1m4n16\ |
44 | | - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\ |
45 | | - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\ |
46 | | - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\ |
47 | | - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" |
48 | | - |
49 | 26 | #define INNER_KERNEL_k1m1n24 \ |
50 | 27 | "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ |
51 | 28 | "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ |
|
55 | 32 | INNER_KERNEL_k1m1n24\ |
56 | 33 | "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" |
57 | 34 |
|
| 35 | +/* row-major z-partition c_block */ |
| 36 | +#define INNER_KERNEL_k1m4n8 \ |
| 37 | + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ |
| 38 | + "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ |
| 39 | + "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" |
| 40 | + |
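/* Editor's note -- one INNER_KERNEL_k1m4n8 step modelled with intrinsics;
   a sketch, not part of the kernel. _mm512_broadcast_f64x2 (AVX512DQ) is
   used here for readability, while the asm uses vbroadcastf32x4, which
   moves the same 128 bits and needs only AVX512F. acc[0..3] stand for
   zmm8..zmm11. */
static inline void k1m4n8_model(const double *a, const double *b,
                                __m512d acc[4])
{
    __m512d a01 = _mm512_broadcast_f64x2(_mm_loadu_pd(a));     /* a0 a1 a0 a1 ... */
    __m512d a23 = _mm512_broadcast_f64x2(_mm_loadu_pd(a + 2)); /* a2 a3 a2 a3 ... */
    __m512d be  = _mm512_movedup_pd(_mm512_loadu_pd(b));       /* b0 b0 b2 b2 ... b6 b6 */
    __m512d bo  = _mm512_movedup_pd(_mm512_loadu_pd(b + 1));   /* b1 b1 b3 b3 ... b7 b7 */
    acc[0] = _mm512_fmadd_pd(a01, be, acc[0]);
    acc[1] = _mm512_fmadd_pd(a01, bo, acc[1]);
    acc[2] = _mm512_fmadd_pd(a23, be, acc[2]);
    acc[3] = _mm512_fmadd_pd(a23, bo, acc[3]);
}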
| 41 | +#define INNER_KERNEL_k1m4n16 \ |
| 42 | + INNER_KERNEL_k1m4n8\ |
| 43 | + "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ |
| 44 | + "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" |
| 45 | + |
58 | 46 | #define INNER_KERNEL_k1m4n24 \ |
59 | | - INNER_KERNEL_k1m2n24\ |
60 | | - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\ |
61 | | - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;" |
| 47 | + INNER_KERNEL_k1m4n16\ |
| 48 | + "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ |
| 49 | + "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" |
62 | 50 |
|
63 | | -#define INNER_KERNEL_k1m8n24 \ |
64 | | - INNER_KERNEL_k1m4n24\ |
65 | | - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\ |
66 | | - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\ |
67 | | - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\ |
68 | | - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;" |
| 51 | +#define INNER_KERNEL_k1m8n8 \ |
| 52 | + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ |
| 53 | + "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ |
| 54 | + "prefetcht0 128(%1);"\ |
| 55 | + "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ |
| 56 | + "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ |
| 57 | + "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ |
| 58 | + "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" |
| 59 | + |
| 60 | +#define INNER_KERNEL_k1m8n16 \ |
| 61 | + INNER_KERNEL_k1m8n8\ |
| 62 | + "prefetcht0 128(%1,%%r12,2);"\ |
| 63 | + "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ |
| 64 | + "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ |
| 65 | + "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ |
| 66 | + "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" |
69 | 67 |
|
| 68 | +#define INNER_KERNEL_k1m8n24 \ |
| 69 | + INNER_KERNEL_k1m8n16\ |
| 70 | + "prefetcht0 128(%1,%%r12,4);"\ |
| 71 | + "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ |
| 72 | + "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ |
| 73 | + "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ |
| 74 | + "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" |
| 75 | + |
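/* Editor's note: as read off the three macros above (an interpretation),
   each 8-column block of B costs one even/odd register pair per 2-row
   group of A, so the m8n24 case uses the full zmm8-zmm31 range -- 24
   accumulators, each holding 8 doubles of the 8x24 C tile in
   pair-interleaved order. */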
| 76 | +/* micro kernels */ |
70 | 77 | #define INNER_KERNELm1(nn) \ |
71 | 78 | "cmpq $1,%2;jb "#nn"3f;"\ |
72 | 79 | #nn"4:\n\t"\ |
|
84 | 91 | #define INNER_KERNELm4(nn) \ |
85 | 92 | "cmpq $1,%2;jb "#nn"00f;"\ |
86 | 93 | #nn"01:\n\t"\ |
87 | | - INNER_KERNEL_k1m4n##nn "addq $32,%0;"\ |
| 94 | + INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ |
88 | 95 | "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ |
89 | 96 | #nn"00:\n\t" |
90 | 97 |
|
91 | 98 | /* %10: pointer used to prefetch C elements ahead of their store; %4 = ldc (in bytes); %11: pointer used to prefetch the next packed B block */
92 | 99 | #define INNER_KERNELm8(nn) \ |
93 | | - "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\ |
| 100 | + "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ |
94 | 101 | #nn"008:\n\t"\ |
95 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ |
96 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ |
| 102 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ |
| 103 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ |
| 104 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ |
97 | 105 | "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ |
98 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ |
99 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ |
100 | | - "prefetcht1 (%11); addq $16,%11;"\ |
101 | | - "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\ |
| 106 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ |
| 107 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ |
| 108 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ |
| 109 | + "prefetcht1 (%11); addq $32,%11;"\ |
| 110 | + "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ |
102 | 111 | "movq %3,%10;"\ |
103 | 112 | #nn"001:\n\t"\ |
104 | 113 | "cmpq $1,%2;jb "#nn"000f;"\ |
105 | 114 | "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ |
106 | | - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ |
| 115 | + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ |
107 | 116 | "decq %2;jmp "#nn"001b;"\ |
108 | 117 | ""#nn"000:\n\t" |
109 | 118 |
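/* Editor's note -- the control flow of INNER_KERNELm8 modelled in C (a
   sketch: step() stands for one INNER_KERNEL_k1m8n## expansion plus the
   "addq $64,%1" B advance; the prefetch helpers are placeholders for the
   prefetcht0/t1 sequences named in their comments). */
static void step(void) {}             /* one k-iteration of the 8xN tile     */
static void prefetch_c_row(void) {}   /* prefetcht1 (%10),63(%10); %10+=ldc  */
static void prefetch_b_next(void) {}  /* prefetcht1 (%11); addq $32,%11      */
static void prefetch_c_store(void) {} /* prefetcht0 of the two C store rows  */
static void kernelm8_shape(long k)
{
    while (k >= 18) {                 /* main loop, unrolled by 6            */
        step(); step(); step(); prefetch_c_row();
        step(); step(); step(); prefetch_b_next();
        k -= 6;
    }
    while (k >= 1) {                  /* drain loop, one step at a time      */
        prefetch_c_store();
        step();
        k--;
    }
}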
|
|
207 | 216 | INNER_STORE_m1n8(%%zmm13,8) |
208 | 217 |
|
209 | 218 | #define INNER_TRANS_4x8(c1,c2,c3,c4) \ |
210 | | - "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ |
211 | | - "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ |
212 | | - "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\ |
213 | | - "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ |
214 | | - "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ |
215 | | - "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" |
| 219 | + "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ |
| 220 | + "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ |
| 221 | + "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ |
| 222 | + "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"
| 223 | + |
| 224 | +#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ |
| 225 | + "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ |
| 226 | + "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ |
| 227 | + "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ |
| 228 | + "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" |
216 | 229 |
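/* Editor's note on the vshuff64x2 immediates above (each 2-bit field of
   the immediate selects one 128-bit lane): $68/$17 gather the low 256-bit
   halves of the two sources, $238/$187 gather the high halves, and
   $177 = lanes 1,0,3,2 swaps adjacent lanes in place, e.g.: */
static inline __m512d swap_adjacent_lanes(__m512d x)
{
    return _mm512_shuffle_f64x2(x, x, 0xB1); /* intrinsic form of vshuff64x2 $177 */
}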
|
217 | 230 | #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ |
218 | | - INNER_TRANS_4x8(c1,c2,c3,c4)\ |
219 | | - INNER_TRANS_4x8(c5,c6,c7,c8)\ |
220 | | - "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\ |
221 | | - "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\ |
222 | | - "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\ |
223 | | - "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\ |
224 | | - "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\ |
225 | | - "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ |
226 | | - "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ |
227 | | - "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" |
| 231 | + INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) |
228 | 232 |
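/* Editor's note, an interpretation of the one-liner above: the z-partition
   kernel accumulates even-column products in c1,c3,c5,c7 and odd-column
   products in c2,c4,c6,c8, so the 8x8 transpose decomposes into two
   independent 4x4 transposes of 128-bit lanes; afterwards each register
   holds one full column of the 8x8 C tile, ready for a contiguous store.
   (This reading assumes mask %6 selects quadwords 0,1,4,5, i.e. 0x33.) */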
|
229 | 233 | //%7 holds the k01 mask (input operand); used only when m=4
230 | 234 | #define INNER_STORE_4x8(c1,c2,c3,c4) \ |
|
250 | 254 | INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) |
251 | 255 |
|
252 | 256 | #define INNER_SAVE_m4n16 \ |
253 | | - "movq %3,%10;"\ |
254 | | - INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ |
255 | | - INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ |
256 | | - INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ |
257 | | - INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) |
| 257 | + INNER_SAVE_m4n8\ |
| 258 | + INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ |
| 259 | + INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) |
258 | 260 |
|
259 | 261 | #define INNER_SAVE_m4n24 \ |
260 | | - "movq %3,%10;"\ |
261 | | - INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ |
262 | | - INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ |
263 | | - INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ |
264 | | - INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ |
265 | | - INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ |
266 | | - INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) |
| 262 | + INNER_SAVE_m4n16\ |
| 263 | + INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ |
| 264 | + INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) |
267 | 265 |
|
268 | 266 | #define INNER_SAVE_m8n8 \ |
269 | 267 | "movq %3,%10;"\ |
270 | 268 | INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ |
271 | 269 | INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) |
272 | 270 |
|
273 | 271 | #define INNER_SAVE_m8n16 \ |
274 | | - "movq %3,%10;"\ |
275 | | - INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ |
276 | | - INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ |
277 | | - INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ |
278 | | - INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) |
| 272 | + INNER_SAVE_m8n8\ |
| 273 | + INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ |
| 274 | + INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) |
279 | 275 |
|
280 | 276 | #define INNER_SAVE_m8n24 \ |
281 | | - "movq %3,%10;"\ |
282 | | - INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ |
283 | | - INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ |
284 | | - INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ |
285 | | - INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ |
286 | | - INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ |
287 | | - INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) |
| 277 | + INNER_SAVE_m8n16\ |
| 278 | + INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ |
| 279 | + INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) |
288 | 280 |
|
289 | 281 | #define COMPUTE_n8 {\ |
290 | 282 | b_pref = packed_b_pointer + 8 * K;\ |
|
327 | 319 | "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ |
328 | 320 | :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ |
329 | 321 | "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ |
330 | | - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ |
| 322 | + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ |
331 | 323 | a_block_pointer -= M * K;\ |
332 | 324 | } |
333 | 325 | #define COMPUTE_n16 {\ |
|
372 | 364 | "leaq (%1,%%r12,4),%1;"\ |
373 | 365 | :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ |
374 | 366 | "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ |
375 | | - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ |
| 367 | + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ |
376 | 368 | "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ |
377 | 369 | a_block_pointer -= M * K;\ |
378 | 370 | } |
|
417 | 409 | "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ |
418 | 410 | "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ |
419 | 411 | :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ |
420 | | - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ |
421 | | - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ |
422 | | - "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ |
| 412 | + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ |
| 413 | + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ |
| 414 | + "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ |
423 | 415 | a_block_pointer -= M * K;\ |
424 | 416 | } |
425 | 417 | static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c, double *alpha){ //icopy=4, ocopy=8
|