From 0481cba126037d765818eb2f79d0dbccb4c79307 Mon Sep 17 00:00:00 2001
From: Sean Parkinson <sean@wolfssl.com>
Date: Sat, 20 Jun 2026 08:56:52 +1000
Subject: [PATCH] ARM64 ASM: optimizations

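Use fewer instructions in the ARM64 AES and ChaCha assembly (both the .S
files and the matching inline-assembly C files) for minor improvements:

* AES-CTR NEON: replace each "rev64 .16b" + "rev32 .16b" pair with a
  single "rev64 .4s" when building the counter block.
* AES-XTS decrypt (NEON and base): fold the separate "lsl #4" into the
  following "sub" as a shifted-register operand.
* AES set encrypt key (256-bit loop): extract the bytes directly from w9
  with "ubfx" instead of first copying w9 to w3.
* ChaCha use-over byte loop: replace the "beq done" / "b loop" pair with
  a single "bne loop" that falls through to the done label.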
---
 wolfcrypt/src/port/arm/armv8-aes-asm.S      | 42 ++++++++-------------
 wolfcrypt/src/port/arm/armv8-aes-asm_c.c    | 42 ++++++++-------------
 wolfcrypt/src/port/arm/armv8-chacha-asm.S   |  3 +-
 wolfcrypt/src/port/arm/armv8-chacha-asm_c.c |  3 +-
 4 files changed, 32 insertions(+), 58 deletions(-)

diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm.S b/wolfcrypt/src/port/arm/armv8-aes-asm.S
index fa48e67b17..0909a9eb79 100644
--- a/wolfcrypt/src/port/arm/armv8-aes-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-aes-asm.S
@@ -44510,37 +44510,32 @@ L_AES_CTR_encrypt_NEON_loop_4:
 	ld1	{v4.2d}, [x9], #16
 	mov	v8.d[1], x10
 	mov	v8.d[0], x11
-	rev64	v8.16b, v8.16b
-	rev32	v8.16b, v8.16b
+	rev64	v8.4s, v8.4s
 	# Round: 0 - XOR in key schedule
 	eor	v0.16b, v8.16b, v4.16b
 	adds	x10, x10, #1
 	adc	x11, x11, xzr
 	mov	v8.d[1], x10
 	mov	v8.d[0], x11
-	rev64	v8.16b, v8.16b
-	rev32	v8.16b, v8.16b
+	rev64	v8.4s, v8.4s
 	eor	v1.16b, v8.16b, v4.16b
 	adds	x10, x10, #1
 	adc	x11, x11, xzr
 	mov	v8.d[1], x10
 	mov	v8.d[0], x11
-	rev64	v8.16b, v8.16b
-	rev32	v8.16b, v8.16b
+	rev64	v8.4s, v8.4s
 	eor	v2.16b, v8.16b, v4.16b
 	adds	x10, x10, #1
 	adc	x11, x11, xzr
 	mov	v8.d[1], x10
 	mov	v8.d[0], x11
-	rev64	v8.16b, v8.16b
-	rev32	v8.16b, v8.16b
+	rev64	v8.4s, v8.4s
 	eor	v3.16b, v8.16b, v4.16b
 	adds	x10, x10, #1
 	adc	x11, x11, xzr
 	mov	v8.d[1], x10
 	mov	v8.d[0], x11
-	rev64	v8.16b, v8.16b
-	rev32	v8.16b, v8.16b
+	rev64	v8.4s, v8.4s
 	sub	w8, w4, #2
 L_AES_CTR_encrypt_NEON_loop_nr_4:
 	tbl	v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b
@@ -44939,8 +44934,7 @@ L_AES_CTR_encrypt_NEON_loop_nr_4:
 	bge	L_AES_CTR_encrypt_NEON_loop_4
 	mov	v2.d[1], x10
 	mov	v2.d[0], x11
-	rev64	v2.16b, v2.16b
-	rev32	v2.16b, v2.16b
+	rev64	v2.4s, v2.4s
 L_AES_CTR_encrypt_NEON_start_2:
 	movi	v12.16b, #0x40
 	movi	v13.16b, #0x80
@@ -44958,15 +44952,13 @@ L_AES_CTR_encrypt_NEON_loop_2:
 	adc	x11, x11, xzr
 	mov	v2.d[1], x10
 	mov	v2.d[0], x11
-	rev64	v2.16b, v2.16b
-	rev32	v2.16b, v2.16b
+	rev64	v2.4s, v2.4s
 	eor	v1.16b, v2.16b, v4.16b
 	adds	x10, x10, #1
 	adc	x11, x11, xzr
 	mov	v2.d[1], x10
 	mov	v2.d[0], x11
-	rev64	v2.16b, v2.16b
-	rev32	v2.16b, v2.16b
+	rev64	v2.4s, v2.4s
 	sub	w8, w4, #2
 L_AES_CTR_encrypt_NEON_loop_nr_2:
 	eor	v8.16b, v0.16b, v12.16b
@@ -45283,8 +45275,7 @@ L_AES_CTR_encrypt_NEON_loop_nr_1:
 	adc	x11, x11, xzr
 	mov	v2.d[1], x10
 	mov	v2.d[0], x11
-	rev64	v2.16b, v2.16b
-	rev32	v2.16b, v2.16b
+	rev64	v2.4s, v2.4s
 L_AES_CTR_encrypt_NEON_data_done:
 	rev32	v2.16b, v2.16b
 	st1	{v2.2d}, [x5]
@@ -49782,8 +49773,7 @@ _AES_XTS_decrypt_NEON:
 	mov	x17, #0x87
 	ands	w19, w2, #15
 	cset	w16, ne
-	lsl	w16, w16, #4
-	sub	w2, w2, w16
+	sub	w2, w2, w16, lsl 4
 	ld1	{v2.2d}, [x3]
 	ld1	{v4.2d}, [x5]
 	rev32	v2.16b, v2.16b
@@ -51681,11 +51671,10 @@ L_AES_set_encrypt_key_loop_256:
 	stp	w6, w7, [x2]
 	stnp	w8, w9, [x2, #8]
 	sub	x2, x2, #16
-	mov	w3, w9
-	ubfx	w6, w3, #8, #8
-	ubfx	w7, w3, #16, #8
-	ubfx	w8, w3, #24, #8
-	ubfx	w3, w3, #0, #8
+	ubfx	w6, w9, #8, #8
+	ubfx	w7, w9, #16, #8
+	ubfx	w8, w9, #24, #8
+	ubfx	w3, w9, #0, #8
 	lsl	w6, w6, #2
 	lsl	w7, w7, #2
 	lsl	w8, w8, #2
@@ -55626,8 +55615,7 @@ _AES_XTS_decrypt:
 #endif /* __APPLE__ */
 	ands	w11, w2, #15
 	cset	w11, ne
-	lsl	w11, w11, #4
-	sub	w2, w2, w11
+	sub	w2, w2, w11, lsl 4
 	mov	x11, #0x87
 	mov	x28, x5
 	ldp	x23, x24, [x3]
diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c
index 7c6e43e972..a1b727fd2b 100644
--- a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c
@@ -44842,37 +44842,32 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "ld1	{v4.2d}, [x9], #16\n\t"
         "mov	v8.d[1], x10\n\t"
         "mov	v8.d[0], x11\n\t"
-        "rev64	v8.16b, v8.16b\n\t"
-        "rev32	v8.16b, v8.16b\n\t"
+        "rev64	v8.4s, v8.4s\n\t"
         /* Round: 0 - XOR in key schedule */
         "eor	v0.16b, v8.16b, v4.16b\n\t"
         "adds	x10, x10, #1\n\t"
         "adc	x11, x11, xzr\n\t"
         "mov	v8.d[1], x10\n\t"
         "mov	v8.d[0], x11\n\t"
-        "rev64	v8.16b, v8.16b\n\t"
-        "rev32	v8.16b, v8.16b\n\t"
+        "rev64	v8.4s, v8.4s\n\t"
         "eor	v1.16b, v8.16b, v4.16b\n\t"
         "adds	x10, x10, #1\n\t"
         "adc	x11, x11, xzr\n\t"
         "mov	v8.d[1], x10\n\t"
         "mov	v8.d[0], x11\n\t"
-        "rev64	v8.16b, v8.16b\n\t"
-        "rev32	v8.16b, v8.16b\n\t"
+        "rev64	v8.4s, v8.4s\n\t"
         "eor	v2.16b, v8.16b, v4.16b\n\t"
         "adds	x10, x10, #1\n\t"
         "adc	x11, x11, xzr\n\t"
         "mov	v8.d[1], x10\n\t"
         "mov	v8.d[0], x11\n\t"
-        "rev64	v8.16b, v8.16b\n\t"
-        "rev32	v8.16b, v8.16b\n\t"
+        "rev64	v8.4s, v8.4s\n\t"
         "eor	v3.16b, v8.16b, v4.16b\n\t"
         "adds	x10, x10, #1\n\t"
         "adc	x11, x11, xzr\n\t"
         "mov	v8.d[1], x10\n\t"
         "mov	v8.d[0], x11\n\t"
-        "rev64	v8.16b, v8.16b\n\t"
-        "rev32	v8.16b, v8.16b\n\t"
+        "rev64	v8.4s, v8.4s\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
     "L_AES_CTR_encrypt_NEON_loop_nr_4_%=:\n\t"
@@ -45272,8 +45267,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "b.ge	L_AES_CTR_encrypt_NEON_loop_4_%=\n\t"
         "mov	v2.d[1], x10\n\t"
         "mov	v2.d[0], x11\n\t"
-        "rev64	v2.16b, v2.16b\n\t"
-        "rev32	v2.16b, v2.16b\n\t"
+        "rev64	v2.4s, v2.4s\n\t"
         "\n"
     "L_AES_CTR_encrypt_NEON_start_2_%=:\n\t"
         "movi	v12.16b, #0x40\n\t"
@@ -45293,15 +45287,13 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "adc	x11, x11, xzr\n\t"
         "mov	v2.d[1], x10\n\t"
         "mov	v2.d[0], x11\n\t"
-        "rev64	v2.16b, v2.16b\n\t"
-        "rev32	v2.16b, v2.16b\n\t"
+        "rev64	v2.4s, v2.4s\n\t"
         "eor	v1.16b, v2.16b, v4.16b\n\t"
         "adds	x10, x10, #1\n\t"
         "adc	x11, x11, xzr\n\t"
         "mov	v2.d[1], x10\n\t"
         "mov	v2.d[0], x11\n\t"
-        "rev64	v2.16b, v2.16b\n\t"
-        "rev32	v2.16b, v2.16b\n\t"
+        "rev64	v2.4s, v2.4s\n\t"
         "sub	w8, %w[nr], #2\n\t"
         "\n"
     "L_AES_CTR_encrypt_NEON_loop_nr_2_%=:\n\t"
@@ -45621,8 +45613,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
         "adc	x11, x11, xzr\n\t"
         "mov	v2.d[1], x10\n\t"
         "mov	v2.d[0], x11\n\t"
-        "rev64	v2.16b, v2.16b\n\t"
-        "rev32	v2.16b, v2.16b\n\t"
+        "rev64	v2.4s, v2.4s\n\t"
         "\n"
     "L_AES_CTR_encrypt_NEON_data_done_%=:\n\t"
         "rev32	v2.16b, v2.16b\n\t"
@@ -49988,8 +49979,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
         "mov	x17, #0x87\n\t"
         "ands	w19, %w[sz], #15\n\t"
         "cset	w16, ne\n\t"
-        "lsl	w16, w16, #4\n\t"
-        "sub	%w[sz], %w[sz], w16\n\t"
+        "sub	%w[sz], %w[sz], w16, lsl 4\n\t"
         "ld1	{v2.2d}, [%x[i]]\n\t"
         "ld1	{v4.2d}, [%x[key2]]\n\t"
         "rev32	v2.16b, v2.16b\n\t"
@@ -51831,11 +51821,10 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
         "stp	w6, w7, [%x[ks]]\n\t"
         "stnp	w8, w9, [%x[ks], #8]\n\t"
         "sub	%x[ks], %x[ks], #16\n\t"
-        "mov	w3, w9\n\t"
-        "ubfx	w6, w3, #8, #8\n\t"
-        "ubfx	w7, w3, #16, #8\n\t"
-        "ubfx	w8, w3, #24, #8\n\t"
-        "ubfx	w3, w3, #0, #8\n\t"
+        "ubfx	w6, w9, #8, #8\n\t"
+        "ubfx	w7, w9, #16, #8\n\t"
+        "ubfx	w8, w9, #24, #8\n\t"
+        "ubfx	w3, w9, #0, #8\n\t"
         "lsl	w6, w6, #2\n\t"
         "lsl	w7, w7, #2\n\t"
         "lsl	w8, w8, #2\n\t"
@@ -55618,8 +55607,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
         "add	x29, sp, #0\n\t"
         "ands	w11, %w[sz], #15\n\t"
         "cset	w11, ne\n\t"
-        "lsl	w11, w11, #4\n\t"
-        "sub	%w[sz], %w[sz], w11\n\t"
+        "sub	%w[sz], %w[sz], w11, lsl 4\n\t"
         "mov	x11, #0x87\n\t"
         "mov	x28, %x[key2]\n\t"
         "ldp	x23, x24, [%x[i]]\n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-chacha-asm.S b/wolfcrypt/src/port/arm/armv8-chacha-asm.S
index 93e9d8e635..2b621209c9 100644
--- a/wolfcrypt/src/port/arm/armv8-chacha-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-chacha-asm.S
@@ -1101,8 +1101,7 @@ L_chacha_use_over_arm64_byte_loop:
 	eor	w5, w5, w4
 	subs	x3, x3, #1
 	strb	w5, [x1], #1
-	beq	L_chacha_use_over_arm64_done
-	b	L_chacha_use_over_arm64_byte_loop
+	bne	L_chacha_use_over_arm64_byte_loop
 L_chacha_use_over_arm64_done:
 	ret
 #ifndef __APPLE__
diff --git a/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
index e440bdee64..12def7fe14 100644
--- a/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
@@ -1024,8 +1024,7 @@ void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
         "eor	w5, w5, w4\n\t"
         "subs	%w[len], %w[len], #1\n\t"
         "strb	w5, [%x[output]], #1\n\t"
-        "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
-        "b	L_chacha_use_over_arm64_byte_loop_%=\n\t"
+        "b.ne	L_chacha_use_over_arm64_byte_loop_%=\n\t"
         "\n"
     "L_chacha_use_over_arm64_done_%=:\n\t"
         : [over] "+r" (over), [output] "+r" (output), [len] "+r" (len)