@@ -56,19 +56,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5656#define VI3 $vr8
5757#define VI4 $vr19
5858#define VT0 $vr23
59+ #define VZE $vr3
60+ #define VT1 $vr4
61+ #define VT2 $vr5
62+ #define VC0 $vr6
5963
6064 PROLOGUE
6165 li.d i0, 0
6266 bge $r0, N, .L999
6367 bge $r0, INCX, .L999
6468 li.d TEMP, 1
69+ vldi VZE, 0
6570 slli.d TEMP, TEMP, BASE_SHIFT
6671 slli.d INCX, INCX, BASE_SHIFT
6772 bne INCX, TEMP, .L20
6873 vld VM0, X, 0
6974#ifdef DOUBLE
75+ vfsub.d VT1, VZE, VM0
7076 addi.d i0, i0, 1
7177 srai.d I, N, 3
78+ vfmaxa.d VM0, VM0, VT1
7279 bge $r0, I, .L11
7380 slli.d i0, i0, 1 //2
7481 vreplgr2vr.d VINC2, i0
@@ -79,12 +86,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
7986 addi.d i0, i0, 1
8087 vinsgr2vr.d VI1, i0, 1
8188 addi.d i0, i0, 3
82- vinsgr2vr.d VI0, i0, 0 //1
89+ vinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization
8390 addi.d i0, i0, 1
84- vinsgr2vr.d VI0, i0, 1 //2
91+ vinsgr2vr.d VI0, i0, 1
8592#else
93+ vfsub.s VT1, VZE, VM0
8694 addi.w i0, i0, 1
8795 srai.d I, N, 3
96+ vfmaxa.s VM0, VM0, VT1
8897 bge $r0, I, .L21
8998 slli.w i0, i0, 2 //4
9099 vreplgr2vr.w VINC2, i0
@@ -115,39 +124,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
115124 vadd.d VI1, VI1, VINC4
116125 vld VX1, X, 2 * SIZE
117126 vadd.d VI2, VI1, VINC2
118- vfmaxa.d x1, VX0, VX1
119- vfcmp.ceq.d VT0, VX0, x1
120- vbitsel.v x2, VI2, VI1, VT0
127+ vfsub.d VT1, VZE, VX0
128+ vfsub.d VT2, VZE, VX1
129+ vfmaxa.d VX0, VX0, VT1
130+ vfmaxa.d VX1, VX1, VT2
131+ vfcmp.clt.d VT0, VX0, VX1 //abs(x0) < abs(x1)
132+ vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
133+ vbitsel.v x2, VI1, VI2, VT0 //i
134+
121135 vld VX0, X, 4 * SIZE
122136 vadd.d VI1, VI2, VINC2
123137 vld VX1, X, 6 * SIZE
124138 vadd.d VI2, VI1, VINC2
125- vfmaxa.d x3, VX0, VX1
126- vfcmp.ceq.d VT0, VX0, x3
127- vbitsel.v x4, VI2, VI1, VT0
128- vfmaxa.d x3, x1, x3
129- vfcmp.ceq.d VT0, x1, x3
130- vbitsel.v x2, x4, x2, VT0
131- vfmaxa.d VM1, VM0, x3
132- vfcmp.ceq.d VT0, VM0, VM1
133- vbitsel.v VM0, VM1, VM0, VT0
134- vbitsel.v VI0, x2, VI0, VT0
139+ vfsub.d VT1, VZE, VX0
140+ vfsub.d VT2, VZE, VX1
141+ vfmaxa.d VX0, VX0, VT1
142+ vfmaxa.d VX1, VX1, VT2
143+ vfcmp.clt.d VT0, VX0, VX1
144+ vbitsel.v x3, VX0, VX1, VT0 //abs(maxf)
145+ vbitsel.v x4, VI1, VI2, VT0 //i
146+ vfcmp.clt.d VC0, x1, x3
147+ vbitsel.v x1, x1, x3, VC0 //abs(maxf)
148+ vbitsel.v x2, x2, x4, VC0 //i
149+ vfcmp.clt.d VT0, VM0, x1
135150 addi.d I, I, -1
136151 addi.d X, X, 8 * SIZE
152+ vbitsel.v VM0, VM0, x1, VT0
153+ vbitsel.v VI0, VI0, x2, VT0
137154#else
138155 vld VX0, X, 0 * SIZE
139156 vadd.w VI1, VI1, VINC4
140157 vld VX1, X, 4 * SIZE
141158 vadd.w VI2, VI1, VINC2
142- vfmaxa.s VM1, VX0, VX1
143- vfcmp.ceq.s VT0, VX0, VM1
159+ vfsub.s VT1, VZE, VX0
160+ vfsub.s VT2, VZE, VX1
161+ vfmaxa.s VX0, VX0, VT1
162+ vfmaxa.s VX1, VX1, VT2
163+ vfcmp.clt.s VT0, VX0, VX1
164+ vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
165+ vbitsel.v x2, VI1, VI2, VT0 //i
144166 addi.d I, I, -1
145- vbitsel.v VI2, VI2, VI1, VT0
146- vfmaxa.s VM1, VM0, VM1
147- vfcmp.ceq.s VT0, VM0, VM1
167+ vfcmp.clt.s VT0, VM0, x1
148168 addi.d X, X, 8 * SIZE
149- vbitsel.v VM0, VM1, VM0, VT0
150- vbitsel.v VI0, VI2, VI0, VT0
169+ vbitsel.v VM0, VM0, x1, VT0
170+ vbitsel.v VI0, VI0, x2, VT0
171+
151172#endif
152173 blt $r0, I, .L10
153174 .align 3
@@ -158,7 +179,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
158179 vreplvei.d VI2, VI0, 1
159180 vreplvei.d x1, VM0, 0
160181 vreplvei.d x2, VM0, 1
161- fcmp.ceq.d $fcc0, $f10 , $f9
182+ fcmp.ceq.d $fcc0, $f9 , $f10
162183 bceqz $fcc0, .L16
163184 vfcmp.clt.d VT0, VI1, VI2
164185 vbitsel.v VI0, VI2, VI1, VT0
@@ -172,28 +193,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
172193 vreplvei.w x2, VM0, 1
173194 vreplvei.w x3, VM0, 2
174195 vreplvei.w x4, VM0, 3
175- vfmaxa.s VM1, x1, x2
176- vfcmp.ceq.s VT0, VM1, x1
177- vbitsel.v VINC2, VI2, VI1, VT0
178- vfmaxa.s VM0, x3, x4
179- vfcmp.ceq.s VT0, x3, VM0
180- vbitsel.v VINC4, VI4, VI3, VT0
181- vfmaxa.s VM0, VM0, VM1
182- vfcmp.ceq.s VT0, VM0, VM1
183- vbitsel.v VI0, VINC4, VINC2, VT0
184- fcmp.ceq.d $fcc0, $f15, $f9
185- bceqz $fcc0, .L26
186- vfcmp.clt.s VT0, VI1, VI0
187- vbitsel.v VI0, VI0, VI1, VT0
188196 b .L26
189197#endif
190198 .align 3
191199
192200#ifdef DOUBLE
193201.L16: //bceqz target: taken when the fcmp.ceq.d of $f9 and $f10 is false
194- vfmaxa.d VM0 , x1, x2
195- vfcmp.ceq.d VT0, x1, VM0
196- vbitsel.v VI0, VI2, VI1 , VT0
202+ vfcmp.clt.d VT0 , x1, x2 //x1 < x2
203+ vbitsel.v VI0, VI1, VI2, VT0 //i: VI2 where x1 < x2, otherwise VI1
204+ vbitsel.v VM0, x1, x2 , VT0 //maxf: x2 where x1 < x2, otherwise x1
197205 .align 3
198206
199207.L17:
@@ -212,10 +220,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
212220
213221.L13:
214222 fld .d $f9, X, 0
215- vfmaxa.d VM1, x1, VM0
216- vfcmp.ceq.d VT0, VM0, VM1
217- vbitsel.v VM0, VM1, VM0, VT0
218- vbitsel.v VI0, VI1, VI0, VT0
223+ fsub .d $f10, $f3, $f9
224+ vfmaxa.d x1, x1, x2
225+ vfcmp.clt.d VT0, VM0, x1
226+ vbitsel.v VM0, VM0, x1, VT0
227+ vbitsel.v VI0, VI0, VI1, VT0
219228 addi.d I, I, -1
220229 addi.d i1, i1, 1
221230 addi.d X, X, SIZE
@@ -241,10 +250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
241250 add .d TEMP, TEMP, INCX
242251 vinsgr2vr.d VM0, t2, 1
243252 slli.d i0, i0, 1 //2
253+ vfsub.d VT1, VZE, VM0
244254 vreplgr2vr.d VINC2, i0
245255 slli.d i0, i0, 1 //4
246256 vreplgr2vr.d VINC4, i0
247257 addi.d i0, i0, -7
258+ vfmaxa.d VM0, VM0, VT1
248259 vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
249260 addi.d i0, i0, 1
250261 vinsgr2vr.d VI1, i0, 1
@@ -269,9 +280,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
269280 add .d X, X, INCX
270281 vinsgr2vr.d VX1, t2, 1
271282 vadd.d VI2, VI1, VINC2
272- vfmaxa.d x1, VX0, VX1
273- vfcmp.ceq.d VT0, VX0, x1
274- vbitsel.v x2, VI2, VI1, VT0
283+
284+ vfsub.d VT1, VZE, VX0
285+ vfsub.d VT2, VZE, VX1
286+ vfmaxa.d VX0, VX0, VT1
287+ vfmaxa.d VX1, VX1, VT2
288+ vfcmp.clt.d VT0, VX0, VX1
289+ vbitsel.v x1, VX0, VX1, VT0
290+ vbitsel.v x2, VI1, VI2, VT0
275291 ld.d t1, X, 0 * SIZE
276292 add .d X, X, INCX
277293 vinsgr2vr.d VX0, t1, 0
@@ -286,16 +302,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
286302 add .d X, X, INCX
287303 vinsgr2vr.d VX1, t2, 1
288304 vadd.d VI2, VI1, VINC2
289- vfmaxa.d x3, VX0, VX1
290- vfcmp.ceq.d VT0, VX0, x3
291- vbitsel.v x4, VI2, VI1, VT0
292- vfmaxa.d x3, x1, x3
293- vfcmp.ceq.d VT0, x1, x3
294- vbitsel.v x2, x4, x2, VT0
295- vfmaxa.d VM1, VM0, x3
296- vbitsel.v VM0, VM1, VM0, VT0
297- vfcmp.ceq.d VT0, VM0, VM1
298- vbitsel.v VI0, x2, VI0, VT0
305+ vfsub.d VT1, VZE, VX0
306+ vfsub.d VT2, VZE, VX1
307+ vfmaxa.d VX0, VX0, VT1
308+ vfmaxa.d VX1, VX1, VT2
309+ vfcmp.clt.d VT0, VX0, VX1
310+ vbitsel.v x3, VX0, VX1, VT0
311+ vbitsel.v x4, VI1, VI2, VT0
312+ vfcmp.clt.d VC0, x1, x3
313+ vbitsel.v x1, x1, x3, VC0
314+ vbitsel.v x2, x2, x4, VC0
315+ vfcmp.clt.d VT0, VM0, x1
316+ vbitsel.v VM0, VM0, x1, VT0
317+ vbitsel.v VI0, VI0, x2, VT0
318+
299319 addi.d I, I, -1
300320 blt $r0, I, .L24
301321 .align 3
@@ -313,9 +333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
313333 .align 3
314334
315335.L26: //choose between candidates x1 and x2 together with their indices
316- vfmaxa.d VM0 , x1, x2
317- vfcmp.ceq.d VT0, x1, VM0
318- vbitsel.v VI0, VI2, VI1 , VT0
336+ vfcmp.clt.d VT0 , x1, x2 //x1 < x2
337+ vbitsel.v VI0, VI1, VI2, VT0 //i: VI2 where x1 < x2, otherwise VI1
338+ vbitsel.v VM0, x1, x2 , VT0 //maxf: x2 where x1 < x2, otherwise x1
319339 .align 3
320340
321341.L27:
@@ -389,14 +409,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
389409 vinsgr2vr.w VX1, t3, 2
390410 vinsgr2vr.w VX1, t4, 3
391411 vadd.w VI2, VI1, VINC2
392- vfmaxa.s VM1, VX0, VX1
393- vfcmp.ceq.s VT0, VX0, VM1
394- vbitsel.v VI2, VI2, VI1, VT0
395- vfmaxa.s VM1, VM0, VM1
396- vfcmp.ceq.s VT0, VM0, VM1
412+ vfsub.s VT1, VZE, VX0
413+ vfsub.s VT2, VZE, VX1
414+ vfmaxa.s VX0, VX0, VT1
415+ vfmaxa.s VX1, VX1, VT2
416+ vfcmp.clt.s VT0, VX0, VX1
417+ vbitsel.v x1, VX0, VX1, VT0
418+ vbitsel.v x2, VI1, VI2, VT0 //i
419+
397420 addi.d I, I, -1
398- vbitsel.v VM0, VM1, VM0, VT0
399- vbitsel.v VI0, VI2, VI0, VT0
421+ vfcmp.clt.s VT0, VM0, x1
422+ vbitsel.v VM0, VM0, x1, VT0
423+ vbitsel.v VI0, VI0, x2, VT0
400424 blt $r0, I, .L24
401425 .align 3
402426
@@ -409,42 +433,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
409433 vreplvei.w x2, VM0, 1
410434 vreplvei.w x3, VM0, 2
411435 vreplvei.w x4, VM0, 3
412- vfmaxa.s VM1, x1, x2
413- vfcmp.ceq.s VT0, VM1, x1
414- vbitsel.v VINC2, VI2, VI1, VT0
415- vfmaxa.s VM0, x3, x4
416- vfcmp.ceq.s VT0, x3, VM0
417- vbitsel.v VINC4, VI4, VI3, VT0
418- vfmaxa.s VM0, VM0, VM1
419- vfcmp.ceq.s VT0, VM0, VM1
420- vbitsel.v VI0, VINC4, VINC2, VT0
421- fcmp.ceq.d $fcc0, $f15, $f9
422- bceqz $fcc0, .L26
423- vfcmp.clt.s VT0, VI1, VI0
424- vbitsel.v VI0, VI0, VI1, VT0
425436 .align 3
426437
427438.L26: //single precision: reduce four candidates (x1..x4, VI1..VI4) pairwise
428- fcmp.ceq.d $fcc0, $f15, $f10
429- bceqz $fcc0, .L27
430- vfcmp.clt.s VT0, VI2, VI0
431- vbitsel.v VI0, VI0, VI2, VT0
439+ fcmp.ceq.s $fcc0, $f9, $f10 //presumably $f9/$f10 alias x1/x2 -- TODO confirm against the register defines
440+ bceqz $fcc0, .L31 //not equal: go pick by value
441+ vfcmp.clt.s VT0, VI1, VI2 //equal values: VI1 < VI2 (indices compared with a float compare)
442+ vbitsel.v VI1, VI2, VI1, VT0 //i: keep the smaller of VI1 and VI2
443+ b .L32
432444 .align 3
433-
434- .L27:
435- fcmp.ceq.d $fcc0, $f15, $f11
436- bceqz $fcc0, .L28
437- vfcmp.clt.s VT0, VI3, VI0
438- vbitsel.v VI0, VI0, VI3, VT0
445+ .L31: //first pair differs
446+ vfcmp.clt.s VT0, x1, x2 //x1 < x2
447+ vbitsel.v VI1, VI1, VI2, VT0 //i: VI2 where x1 < x2, otherwise VI1
448+ vbitsel.v x1, x1, x2, VT0 //maxf of the first pair, kept in x1
439449 .align 3
440-
441- .L28:
442- fcmp.ceq.d $fcc0, $f15, $f12
443- bceqz $fcc0, .L29
444- vfcmp.clt.s VT0, VI4, VI0
445- vbitsel.v VI0, VI0, VI4, VT0
450+ .L32: //second pair: x3/x4 with VI3/VI4
451+ fcmp.ceq.s $fcc0, $f11, $f12 //presumably $f11/$f12 alias x3/x4 -- TODO confirm
452+ bceqz $fcc0, .L33 //not equal: go pick by value
453+ vfcmp.clt.s VT1, VI3, VI4 //equal values: VI3 < VI4
454+ vbitsel.v VI3, VI4, VI3, VT1 //i: keep the smaller of VI3 and VI4
455+ b .L34
456+ .align 3
457+ .L33: //second pair differs
458+ vfcmp.clt.s VT1, x3, x4 //x3 < x4
459+ vbitsel.v x3, x3, x4, VT1 //maxf of the second pair, kept in x3
460+ vbitsel.v VI3, VI3, VI4, VT1 //i: VI4 where x3 < x4, otherwise VI3
461+ .align 3
462+ .L34: //final step: winner of pair one (x1, VI1) against winner of pair two (x3, VI3)
463+ fcmp.ceq.s $fcc0, $f9, $f11
464+ bceqz $fcc0, .L35 //not equal: go pick by value
465+ vfcmp.clt.s VT0, VI1, VI3 //equal values: VI1 < VI3
466+ vbitsel.v VI0, VI3, VI1, VT0 //i: keep the smaller of VI1 and VI3
467+ vxor.v VM0, x1, VZE //VZE was loaded with vldi 0, so this copies x1 into VM0
468+ b .L29
469+ .align 3
470+ .L35: //pair winners differ
471+ vfcmp.clt.s VT0, x1, x3 //x1 < x3
472+ vbitsel.v VM0, x1, x3, VT0 //maxf: x3 where x1 < x3, otherwise x1
473+ vbitsel.v VI0, VI1, VI3, VT0 //i: VI3 where x1 < x3, otherwise VI1
446474 .align 3
447-
448475.L29:
449476 movfr2gr.s i0, $f20
450477 .align 3
@@ -462,10 +489,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
462489
463490.L22:
464491 LD $f9, X, 0
465- VFMAXA VM1, x1, VM0
466- VCMPEQ VT0, VM0, VM1
467- vbitsel.v VM0, VM1, VM0, VT0
468- vbitsel.v VI0, VI1, VI0, VT0
492+ #ifdef DOUBLE
493+ fsub .d $f10, $f3, $f9
494+ vfmaxa.d x1, x1, x2
495+ vfcmp.clt.d VT0, VM0, x1
496+ #else
497+ fsub .s $f10, $f3, $f9
498+ vfmaxa.s x1, x1, x2
499+ vfcmp.clt.s VT0, VM0, x1
500+ #endif
501+ vbitsel.v VM0, VM0, x1, VT0
502+ vbitsel.v VI0, VI0, VI1, VT0
469503 addi.d I, I, -1
470504 addi.d i1, i1, 1
471505 add .d X, X, INCX
0 commit comments