sha: Move Armv7 dispatching to C

sha256_block_data_order_hw required a bit of wrestling with Arm
immediate limits. PC-relative addressing in 32-bit Arm is a huge mess.
I think I could have avoided the extra load (of the K256 offset from
.LK256_shortcut) with a lot of effort (convincing the assembler to
evaluate a messy expression), but this is simpler and there was no
measurable performance difference.

Change-Id: I3fab4abc0fa24e0d689581e2c9b9faaa32bd7442
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64749
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
index c52b546..532a81b 100644
--- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,24 +197,11 @@
.code 32
#endif
-.global sha1_block_data_order
-.type sha1_block_data_order,%function
+.global sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,%function
.align 5
-sha1_block_data_order:
-#if __ARM_MAX_ARCH__>=7
-.Lsha1_block:
- adr r3,.Lsha1_block
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV8_SHA1
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+sha1_block_data_order_nohw:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
@@ -304,17 +291,13 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha1_block_data_order,.-sha1_block_data_order
+.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
-#if __ARM_MAX_ARCH__>=7
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha1_block
-#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
@@ -530,10 +513,10 @@
.arch armv7-a
.fpu neon
+.global sha1_block_data_order_neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
-.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@@ -625,10 +608,10 @@
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
-.type sha1_block_data_order_armv8,%function
+.global sha1_block_data_order_hw
+.type sha1_block_data_order_hw,%function
.align 5
-sha1_block_data_order_armv8:
-.LARMv8:
+sha1_block_data_order_hw:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
@@ -693,16 +676,10 @@
vldmia sp!,{d8-d15}
ret @ bx lr
-.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
#endif
___
}}}
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-___
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
index fa82f3c..59f3417 100644
--- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,34 +217,15 @@
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha256_block_data_order
-#endif
.align 5
-.global sha256_block_data_order
-.type sha256_block_data_order,%function
-sha256_block_data_order:
-.Lsha256_block_data_order:
- adr r3,.Lsha256_block_data_order
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV8_SHA256
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+.global sha256_block_data_order_nohw
+.type sha256_block_data_order_nohw,%function
+sha256_block_data_order_nohw:
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- @ TODO(davidben): When the OPENSSL_armcap logic above is removed,
- @ replace this with a simple ADR.
- sub $Ktbl,r3,#256+32 @ K256
+ adr $Ktbl,K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
@@ -298,7 +279,7 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha256_block_data_order,.-sha256_block_data_order
+.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
@@ -483,10 +464,12 @@
.align 5
.skip 16
sha256_block_data_order_neon:
-.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
+ @ In Arm mode, the following ADR runs up against the limits of encodable
+ @ offsets. It only fits because the offset, when the ADR is placed here,
+ @ is a multiple of 16.
adr $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
@@ -613,12 +596,26 @@
# define INST(a,b,c,d) .byte a,b,c,d
# endif
-.type sha256_block_data_order_armv8,%function
+.LK256_shortcut:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add+4)
+#else
+.word K256-(.LK256_add+8)
+#endif
+
+.global sha256_block_data_order_hw
+.type sha256_block_data_order_hw,%function
.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
+sha256_block_data_order_hw:
+ @ K256 is too far to reference from one ADR instruction in Thumb mode. In
+ @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
+ @ boundary. For simplicity, just load the offset from .LK256_shortcut.
+ ldr $Ktbl,.LK256_shortcut
+.LK256_add:
+ add $Ktbl,pc,$Ktbl
+
vld1.32 {$ABCD,$EFGH},[$ctx]
- sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8
@@ -680,17 +677,13 @@
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
-.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
___
open SELF,$0;
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
index f52b5b0..f2d1d22 100644
--- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,33 +276,13 @@
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha512_block_data_order
-.skip 32-4
-#else
-.skip 32
-#endif
-.global sha512_block_data_order
-.type sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
- adr r3,.Lsha512_block_data_order
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+.global sha512_block_data_order_nohw
+.type sha512_block_data_order_nohw,%function
+sha512_block_data_order_nohw:
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
- @ TODO(davidben): When the OPENSSL_armcap logic above is removed,
- @ replace this with a simple ADR.
- sub $Ktbl,r3,#672 @ K512
+ adr $Ktbl,K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@@ -501,7 +481,7 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha512_block_data_order,.-sha512_block_data_order
+.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
___
{
@@ -612,7 +592,6 @@
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
-.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
@@ -650,10 +629,6 @@
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 28975e1..b55ea8e 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@
// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
// defined in assembly.
-#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
#define SHA1_ASM
#define SHA256_ASM
@@ -39,6 +39,35 @@
void sha512_block_data_order(uint64_t *state, const uint8_t *data,
size_t num_blocks);
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
+
+#define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
+#define SHA512_ASM_NOHW
+
+#define SHA1_ASM_HW
+OPENSSL_INLINE int sha1_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA1_capable();
+}
+
+#define SHA1_ASM_NEON
+void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+#define SHA256_ASM_HW
+OPENSSL_INLINE int sha256_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA256_capable();
+}
+
+#define SHA256_ASM_NEON
+void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+// Armv8.2 SHA-512 instructions are not available in 32-bit.
+#define SHA512_ASM_NEON
+void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data,
+ size_t num);
+
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
#define SHA1_ASM_NOHW
@@ -148,6 +177,7 @@
void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
size_t num);
#endif
+
#if defined(SHA512_ASM_NOHW)
void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
size_t num);
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index 7b267e3..7a97266 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -410,6 +410,12 @@
return;
}
#endif
+#if defined(SHA1_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha1_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha1_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
index 0b0aca2..8cedc5f 100644
--- a/crypto/fipsmodule/sha/sha256.c
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -332,6 +332,12 @@
return;
}
#endif
+#if defined(SHA256_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha256_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha256_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
index 0f4142c..d31ab71 100644
--- a/crypto/fipsmodule/sha/sha512.c
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -516,6 +516,12 @@
return;
}
#endif
+#if defined(SHA512_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha512_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha512_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc
index 22856f8..671c170 100644
--- a/crypto/fipsmodule/sha/sha_test.cc
+++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -75,6 +75,11 @@
return;
}
#endif
+#if defined(SHA1_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA1_ASM_NOHW)
CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -107,6 +112,11 @@
return;
}
#endif
+#if defined(SHA256_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA256_ASM_NOHW)
CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -132,6 +142,11 @@
CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
}
#endif
+#if defined(SHA512_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA512_ASM_NOHW)
CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif