|
104 | 104 | KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\ |
105 | 105 | #ndim"8883:\n\t"\ |
106 | 106 | "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim |
| 107 | + |
107 | 108 | /* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ |
108 | 109 | #define KERNEL_k1m4n1 \ |
109 | 110 | "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ |
|
137 | 138 | "decq %5; jnz "#ndim"4441b;"\ |
138 | 139 | #ndim"4442:\n\t"\ |
139 | 140 | SAVE_m4n##ndim |
| 141 | + |
140 | 142 | /* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */ |
141 | 143 | #if A_CONJ == B_CONJ |
/* acc_m2n1_exp(ar,ai,b2,cl,cr): definition selected by the enclosing
 * `#if A_CONJ == B_CONJ`.
 * Expands to an asm-template string holding two packed single-precision
 * fused multiply-adds on xmm registers; each argument is a register number
 * that is stringized with # and pasted after "%%xmm":
 *   xmm<cl> += xmm<ar> * xmm<b2>
 *   xmm<cr> += xmm<ai> * xmm<b2>
 * (AT&T operand order: the last operand is the accumulator/destination.)
 * The doubled %% is how a literal % is written in a GCC extended-asm template.
 * NOTE(review): ar/ai presumably hold the real/imaginary parts of A and
 * cl/cr the two "expanded" accumulators -- confirm against the callers. */
142 | 144 | #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" |
|
189 | 191 | "decq %5; jnz "#ndim"2221b;"\ |
190 | 192 | #ndim"2222:\n\t"\ |
191 | 193 | SAVE_m2n##ndim |
| 194 | + |
192 | 195 | /* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */ |
193 | 196 | #if A_CONJ == B_CONJ |
/* acc_m1n1_exp(ar,ai,b2,cl,cr): definition selected by the enclosing
 * `#if A_CONJ == B_CONJ`.
 * Expands to an asm-template string holding two vfmadd231ps instructions on
 * xmm registers; each argument is a register number that is stringized
 * with # and pasted after "%%xmm":
 *   xmm<cl> += xmm<ar> * xmm<b2>
 *   xmm<cr> += xmm<ai> * xmm<b2>
 * (AT&T operand order: the last operand is the accumulator/destination.)
 * The doubled %% is how a literal % is written in a GCC extended-asm template.
 * NOTE(review): ar/ai presumably hold the real/imaginary parts of A and
 * cl/cr the two "expanded" accumulators -- confirm against the callers. */
194 | 197 | #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" |
|
242 | 245 | "decq %5; jnz "#ndim"1111b;"\ |
243 | 246 | #ndim"1112:\n\t"\ |
244 | 247 | SAVE_m1n##ndim |
| 248 | + |
245 | 249 | #define COMPUTE(ndim) {\ |
246 | 250 | b_pref = b_ptr + ndim * K *2;\ |
247 | 251 | __asm__ __volatile__ (\ |
|
266 | 270 | "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ |
267 | 271 | a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ |
268 | 272 | } |
| 273 | + |
269 | 274 | int __attribute__ ((noinline)) |
270 | 275 | CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) |
271 | 276 | { |
|
0 commit comments