From 0481cba126037d765818eb2f79d0dbccb4c79307 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Sat, 20 Jun 2026 08:56:52 +1000 Subject: [PATCH] ARM64 ASM: optimizations Use fewer instructions in the AES and ChaCha assembly for minor improvements: replace each rev64 .16b + rev32 .16b pair with a single rev64 .4s, fold the lsl into the shifted-register operand of sub, extract the ubfx fields directly from w9 instead of copying it to w3 first, and replace the beq + b pair at the end of the ChaCha byte loop with a single bne. --- wolfcrypt/src/port/arm/armv8-aes-asm.S | 42 ++++++++------------- wolfcrypt/src/port/arm/armv8-aes-asm_c.c | 42 ++++++++------------- wolfcrypt/src/port/arm/armv8-chacha-asm.S | 3 +- wolfcrypt/src/port/arm/armv8-chacha-asm_c.c | 3 +- 4 files changed, 32 insertions(+), 58 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm.S b/wolfcrypt/src/port/arm/armv8-aes-asm.S index fa48e67b17..0909a9eb79 100644 --- a/wolfcrypt/src/port/arm/armv8-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-aes-asm.S @@ -44510,37 +44510,32 @@ L_AES_CTR_encrypt_NEON_loop_4: ld1 {v4.2d}, [x9], #16 mov v8.d[1], x10 mov v8.d[0], x11 - rev64 v8.16b, v8.16b - rev32 v8.16b, v8.16b + rev64 v8.4s, v8.4s # Round: 0 - XOR in key schedule eor v0.16b, v8.16b, v4.16b adds x10, x10, #1 adc x11, x11, xzr mov v8.d[1], x10 mov v8.d[0], x11 - rev64 v8.16b, v8.16b - rev32 v8.16b, v8.16b + rev64 v8.4s, v8.4s eor v1.16b, v8.16b, v4.16b adds x10, x10, #1 adc x11, x11, xzr mov v8.d[1], x10 mov v8.d[0], x11 - rev64 v8.16b, v8.16b - rev32 v8.16b, v8.16b + rev64 v8.4s, v8.4s eor v2.16b, v8.16b, v4.16b adds x10, x10, #1 adc x11, x11, xzr mov v8.d[1], x10 mov v8.d[0], x11 - rev64 v8.16b, v8.16b - rev32 v8.16b, v8.16b + rev64 v8.4s, v8.4s eor v3.16b, v8.16b, v4.16b adds x10, x10, #1 adc x11, x11, xzr mov v8.d[1], x10 mov v8.d[0], x11 - rev64 v8.16b, v8.16b - rev32 v8.16b, v8.16b + rev64 v8.4s, v8.4s sub w8, w4, #2 L_AES_CTR_encrypt_NEON_loop_nr_4: tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b @@ -44939,8 +44934,7 @@ L_AES_CTR_encrypt_NEON_loop_nr_4: bge L_AES_CTR_encrypt_NEON_loop_4 mov v2.d[1], x10 mov v2.d[0], x11 - rev64 v2.16b, v2.16b - rev32 v2.16b, v2.16b + rev64 v2.4s, v2.4s L_AES_CTR_encrypt_NEON_start_2: movi v12.16b, #0x40 movi v13.16b, #0x80 @@ -44958,15 +44952,13 
@@ L_AES_CTR_encrypt_NEON_loop_2: adc x11, x11, xzr mov v2.d[1], x10 mov v2.d[0], x11 - rev64 v2.16b, v2.16b - rev32 v2.16b, v2.16b + rev64 v2.4s, v2.4s eor v1.16b, v2.16b, v4.16b adds x10, x10, #1 adc x11, x11, xzr mov v2.d[1], x10 mov v2.d[0], x11 - rev64 v2.16b, v2.16b - rev32 v2.16b, v2.16b + rev64 v2.4s, v2.4s sub w8, w4, #2 L_AES_CTR_encrypt_NEON_loop_nr_2: eor v8.16b, v0.16b, v12.16b @@ -45283,8 +45275,7 @@ L_AES_CTR_encrypt_NEON_loop_nr_1: adc x11, x11, xzr mov v2.d[1], x10 mov v2.d[0], x11 - rev64 v2.16b, v2.16b - rev32 v2.16b, v2.16b + rev64 v2.4s, v2.4s L_AES_CTR_encrypt_NEON_data_done: rev32 v2.16b, v2.16b st1 {v2.2d}, [x5] @@ -49782,8 +49773,7 @@ _AES_XTS_decrypt_NEON: mov x17, #0x87 ands w19, w2, #15 cset w16, ne - lsl w16, w16, #4 - sub w2, w2, w16 + sub w2, w2, w16, lsl 4 ld1 {v2.2d}, [x3] ld1 {v4.2d}, [x5] rev32 v2.16b, v2.16b @@ -51681,11 +51671,10 @@ L_AES_set_encrypt_key_loop_256: stp w6, w7, [x2] stnp w8, w9, [x2, #8] sub x2, x2, #16 - mov w3, w9 - ubfx w6, w3, #8, #8 - ubfx w7, w3, #16, #8 - ubfx w8, w3, #24, #8 - ubfx w3, w3, #0, #8 + ubfx w6, w9, #8, #8 + ubfx w7, w9, #16, #8 + ubfx w8, w9, #24, #8 + ubfx w3, w9, #0, #8 lsl w6, w6, #2 lsl w7, w7, #2 lsl w8, w8, #2 @@ -55626,8 +55615,7 @@ _AES_XTS_decrypt: #endif /* __APPLE__ */ ands w11, w2, #15 cset w11, ne - lsl w11, w11, #4 - sub w2, w2, w11 + sub w2, w2, w11, lsl 4 mov x11, #0x87 mov x28, x5 ldp x23, x24, [x3] diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c index 7c6e43e972..a1b727fd2b 100644 --- a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c @@ -44842,37 +44842,32 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out, "ld1 {v4.2d}, [x9], #16\n\t" "mov v8.d[1], x10\n\t" "mov v8.d[0], x11\n\t" - "rev64 v8.16b, v8.16b\n\t" - "rev32 v8.16b, v8.16b\n\t" + "rev64 v8.4s, v8.4s\n\t" /* Round: 0 - XOR in key schedule */ "eor v0.16b, v8.16b, v4.16b\n\t" "adds x10, x10, #1\n\t" "adc x11, 
x11, xzr\n\t" "mov v8.d[1], x10\n\t" "mov v8.d[0], x11\n\t" - "rev64 v8.16b, v8.16b\n\t" - "rev32 v8.16b, v8.16b\n\t" + "rev64 v8.4s, v8.4s\n\t" "eor v1.16b, v8.16b, v4.16b\n\t" "adds x10, x10, #1\n\t" "adc x11, x11, xzr\n\t" "mov v8.d[1], x10\n\t" "mov v8.d[0], x11\n\t" - "rev64 v8.16b, v8.16b\n\t" - "rev32 v8.16b, v8.16b\n\t" + "rev64 v8.4s, v8.4s\n\t" "eor v2.16b, v8.16b, v4.16b\n\t" "adds x10, x10, #1\n\t" "adc x11, x11, xzr\n\t" "mov v8.d[1], x10\n\t" "mov v8.d[0], x11\n\t" - "rev64 v8.16b, v8.16b\n\t" - "rev32 v8.16b, v8.16b\n\t" + "rev64 v8.4s, v8.4s\n\t" "eor v3.16b, v8.16b, v4.16b\n\t" "adds x10, x10, #1\n\t" "adc x11, x11, xzr\n\t" "mov v8.d[1], x10\n\t" "mov v8.d[0], x11\n\t" - "rev64 v8.16b, v8.16b\n\t" - "rev32 v8.16b, v8.16b\n\t" + "rev64 v8.4s, v8.4s\n\t" "sub w8, %w[nr], #2\n\t" "\n" "L_AES_CTR_encrypt_NEON_loop_nr_4_%=:\n\t" @@ -45272,8 +45267,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out, "b.ge L_AES_CTR_encrypt_NEON_loop_4_%=\n\t" "mov v2.d[1], x10\n\t" "mov v2.d[0], x11\n\t" - "rev64 v2.16b, v2.16b\n\t" - "rev32 v2.16b, v2.16b\n\t" + "rev64 v2.4s, v2.4s\n\t" "\n" "L_AES_CTR_encrypt_NEON_start_2_%=:\n\t" "movi v12.16b, #0x40\n\t" @@ -45293,15 +45287,13 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out, "adc x11, x11, xzr\n\t" "mov v2.d[1], x10\n\t" "mov v2.d[0], x11\n\t" - "rev64 v2.16b, v2.16b\n\t" - "rev32 v2.16b, v2.16b\n\t" + "rev64 v2.4s, v2.4s\n\t" "eor v1.16b, v2.16b, v4.16b\n\t" "adds x10, x10, #1\n\t" "adc x11, x11, xzr\n\t" "mov v2.d[1], x10\n\t" "mov v2.d[0], x11\n\t" - "rev64 v2.16b, v2.16b\n\t" - "rev32 v2.16b, v2.16b\n\t" + "rev64 v2.4s, v2.4s\n\t" "sub w8, %w[nr], #2\n\t" "\n" "L_AES_CTR_encrypt_NEON_loop_nr_2_%=:\n\t" @@ -45621,8 +45613,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out, "adc x11, x11, xzr\n\t" "mov v2.d[1], x10\n\t" "mov v2.d[0], x11\n\t" - "rev64 v2.16b, v2.16b\n\t" - "rev32 v2.16b, v2.16b\n\t" + "rev64 v2.4s, v2.4s\n\t" "\n" 
"L_AES_CTR_encrypt_NEON_data_done_%=:\n\t" "rev32 v2.16b, v2.16b\n\t" @@ -49988,8 +49979,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, "mov x17, #0x87\n\t" "ands w19, %w[sz], #15\n\t" "cset w16, ne\n\t" - "lsl w16, w16, #4\n\t" - "sub %w[sz], %w[sz], w16\n\t" + "sub %w[sz], %w[sz], w16, lsl 4\n\t" "ld1 {v2.2d}, [%x[i]]\n\t" "ld1 {v4.2d}, [%x[key2]]\n\t" "rev32 v2.16b, v2.16b\n\t" @@ -51831,11 +51821,10 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, "stp w6, w7, [%x[ks]]\n\t" "stnp w8, w9, [%x[ks], #8]\n\t" "sub %x[ks], %x[ks], #16\n\t" - "mov w3, w9\n\t" - "ubfx w6, w3, #8, #8\n\t" - "ubfx w7, w3, #16, #8\n\t" - "ubfx w8, w3, #24, #8\n\t" - "ubfx w3, w3, #0, #8\n\t" + "ubfx w6, w9, #8, #8\n\t" + "ubfx w7, w9, #16, #8\n\t" + "ubfx w8, w9, #24, #8\n\t" + "ubfx w3, w9, #0, #8\n\t" "lsl w6, w6, #2\n\t" "lsl w7, w7, #2\n\t" "lsl w8, w8, #2\n\t" @@ -55618,8 +55607,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i, "add x29, sp, #0\n\t" "ands w11, %w[sz], #15\n\t" "cset w11, ne\n\t" - "lsl w11, w11, #4\n\t" - "sub %w[sz], %w[sz], w11\n\t" + "sub %w[sz], %w[sz], w11, lsl 4\n\t" "mov x11, #0x87\n\t" "mov x28, %x[key2]\n\t" "ldp x23, x24, [%x[i]]\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-chacha-asm.S b/wolfcrypt/src/port/arm/armv8-chacha-asm.S index 93e9d8e635..2b621209c9 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha-asm.S +++ b/wolfcrypt/src/port/arm/armv8-chacha-asm.S @@ -1101,8 +1101,7 @@ L_chacha_use_over_arm64_byte_loop: eor w5, w5, w4 subs x3, x3, #1 strb w5, [x1], #1 - beq L_chacha_use_over_arm64_done - b L_chacha_use_over_arm64_byte_loop + bne L_chacha_use_over_arm64_byte_loop L_chacha_use_over_arm64_done: ret #ifndef __APPLE__ diff --git a/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c index e440bdee64..12def7fe14 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c @@ -1024,8 
+1024,7 @@ void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len) "eor w5, w5, w4\n\t" "subs %w[len], %w[len], #1\n\t" "strb w5, [%x[output]], #1\n\t" - "b.eq L_chacha_use_over_arm64_done_%=\n\t" - "b L_chacha_use_over_arm64_byte_loop_%=\n\t" + "b.ne L_chacha_use_over_arm64_byte_loop_%=\n\t" "\n" "L_chacha_use_over_arm64_done_%=:\n\t" : [over] "+r" (over), [output] "+r" (output), [len] "+r" (len)