sha: Move Armv7 dispatching to C (reland)
This is a reland of
https://boringssl-review.googlesource.com/c/boringssl/+/64749, which was
reverted in
https://boringssl-review.googlesource.com/c/boringssl/+/65328 due to
issues in Arm mode (i.e. not Thumb mode) builds that target Armv6+
instead of Armv7+.
The issue was that sha256_block_data_order_nohw has slightly different
sizes depending on __ARM_ARCH. Prior to moving the dispatch, the sizes
worked out such that they were always encodable in ADR. After moving the
dispatch, the instructions got shorter, such that the Armv7+ build still
worked, but the Armv6+ build needed to encode an offset of 0x1060
(previously 0x1080), which does not fit.
See https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/ for
details on Arm's very fussy immediate value encoding. It's not the only
form used for ADR (Thumb2 works very differently), but it's the
applicable one here.
While we could shuffle things around, this is all far too fragile. Just
use the LDR; ADD pattern we used for the other function. ADRL would
avoid a load (it splits the offset into two constants without a constant
bank), but that's a pseudo-instruction that's only supported by gas.
clang's assembler didn't want to implement it. Android has a macro at
https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm,
but it didn't work for me when I tried it. Also, searching around, it
sounds like ADRL in gas only works in Arm mode and not Thumb mode?
We could probably work through all that, but the compiler emits constant
banks on 32-bit Arm all the time. (I got this pattern from Clang's
output.) This is probably not worth the trouble.
Bug: 673
Change-Id: I165544764a931b293aa66fb3fc9bb8f01eeb8092
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/65808
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
index c52b546..532a81b 100644
--- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,24 +197,11 @@
.code 32
#endif
-.global sha1_block_data_order
-.type sha1_block_data_order,%function
+.global sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,%function
.align 5
-sha1_block_data_order:
-#if __ARM_MAX_ARCH__>=7
-.Lsha1_block:
- adr r3,.Lsha1_block
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV8_SHA1
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+sha1_block_data_order_nohw:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
@@ -304,17 +291,13 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha1_block_data_order,.-sha1_block_data_order
+.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
-#if __ARM_MAX_ARCH__>=7
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha1_block
-#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
@@ -530,10 +513,10 @@
.arch armv7-a
.fpu neon
+.global sha1_block_data_order_neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
-.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@@ -625,10 +608,10 @@
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
-.type sha1_block_data_order_armv8,%function
+.global sha1_block_data_order_hw
+.type sha1_block_data_order_hw,%function
.align 5
-sha1_block_data_order_armv8:
-.LARMv8:
+sha1_block_data_order_hw:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
@@ -693,16 +676,10 @@
vldmia sp!,{d8-d15}
ret @ bx lr
-.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
#endif
___
}}}
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-___
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
index fa82f3c..99b8b2a 100644
--- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,34 +217,15 @@
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha256_block_data_order
-#endif
.align 5
-.global sha256_block_data_order
-.type sha256_block_data_order,%function
-sha256_block_data_order:
-.Lsha256_block_data_order:
- adr r3,.Lsha256_block_data_order
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV8_SHA256
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+.global sha256_block_data_order_nohw
+.type sha256_block_data_order_nohw,%function
+sha256_block_data_order_nohw:
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- @ TODO(davidben): When the OPENSSL_armcap logic above is removed,
- @ replace this with a simple ADR.
- sub $Ktbl,r3,#256+32 @ K256
+ adr $Ktbl,K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
@@ -298,7 +279,7 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha256_block_data_order,.-sha256_block_data_order
+.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
@@ -478,16 +459,37 @@
.arch armv7-a
.fpu neon
+.LK256_shortcut_neon:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_neon+4)
+#else
+.word K256-(.LK256_add_neon+8)
+#endif
+
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
-.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
- adr $Ktbl,K256
+
+ @ K256 is just at the boundary of being easily referenced by an ADR from
+ @ this function. In Arm mode, when building with __ARM_ARCH=6, it does
+ @ not fit. By moving code around, we could make it fit, but this is too
+ @ fragile. For simplicity, just load the offset from
+ @ .LK256_shortcut_neon.
+ @
+ @ TODO(davidben): adrl would avoid a load, but clang-assembler does not
+ @ support it. We might be able to emulate it with a macro, but Android's
+ @ did not work when I tried it.
+ @ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
+ ldr $Ktbl,.LK256_shortcut_neon
+.LK256_add_neon:
+ add $Ktbl,pc,$Ktbl
+
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
@@ -613,12 +615,26 @@
# define INST(a,b,c,d) .byte a,b,c,d
# endif
-.type sha256_block_data_order_armv8,%function
+.LK256_shortcut_hw:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_hw+4)
+#else
+.word K256-(.LK256_add_hw+8)
+#endif
+
+.global sha256_block_data_order_hw
+.type sha256_block_data_order_hw,%function
.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
+sha256_block_data_order_hw:
+ @ K256 is too far to reference from one ADR command in Thumb mode. In
+ @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
+ @ boundary. For simplicity, just load the offset from .LK256_shortcut_hw.
+ ldr $Ktbl,.LK256_shortcut_hw
+.LK256_add_hw:
+ add $Ktbl,pc,$Ktbl
+
vld1.32 {$ABCD,$EFGH},[$ctx]
- sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8
@@ -680,17 +696,13 @@
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
-.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
___
open SELF,$0;
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
index f52b5b0..f2d1d22 100644
--- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,33 +276,13 @@
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha512_block_data_order
-.skip 32-4
-#else
-.skip 32
-#endif
-.global sha512_block_data_order
-.type sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
- adr r3,.Lsha512_block_data_order
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+.global sha512_block_data_order_nohw
+.type sha512_block_data_order_nohw,%function
+sha512_block_data_order_nohw:
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
- @ TODO(davidben): When the OPENSSL_armcap logic above is removed,
- @ replace this with a simple ADR.
- sub $Ktbl,r3,#672 @ K512
+ adr $Ktbl,K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@@ -501,7 +481,7 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha512_block_data_order,.-sha512_block_data_order
+.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
___
{
@@ -612,7 +592,6 @@
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
-.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
@@ -650,10 +629,6 @@
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 7c15b2c..0c05d73 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@
// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
// defined in assembly.
-#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
#define SHA1_ASM
#define SHA256_ASM
@@ -39,6 +39,35 @@
void sha512_block_data_order(uint64_t *state, const uint8_t *data,
size_t num_blocks);
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
+
+#define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
+#define SHA512_ASM_NOHW
+
+#define SHA1_ASM_HW
+OPENSSL_INLINE int sha1_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA1_capable();
+}
+
+#define SHA1_ASM_NEON
+void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+#define SHA256_ASM_HW
+OPENSSL_INLINE int sha256_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA256_capable();
+}
+
+#define SHA256_ASM_NEON
+void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+// Armv8.2 SHA-512 instructions are not available in 32-bit.
+#define SHA512_ASM_NEON
+void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data,
+ size_t num);
+
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
#define SHA1_ASM_NOHW
@@ -149,6 +178,7 @@
void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
size_t num);
#endif
+
#if defined(SHA512_ASM_NOHW)
void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
size_t num);
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index 7b267e3..7a97266 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -410,6 +410,12 @@
return;
}
#endif
+#if defined(SHA1_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha1_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha1_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
index 0b0aca2..8cedc5f 100644
--- a/crypto/fipsmodule/sha/sha256.c
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -332,6 +332,12 @@
return;
}
#endif
+#if defined(SHA256_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha256_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha256_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
index 0f4142c..d31ab71 100644
--- a/crypto/fipsmodule/sha/sha512.c
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -516,6 +516,12 @@
return;
}
#endif
+#if defined(SHA512_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha512_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha512_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc
index 22856f8..671c170 100644
--- a/crypto/fipsmodule/sha/sha_test.cc
+++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -75,6 +75,11 @@
return;
}
#endif
+#if defined(SHA1_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA1_ASM_NOHW)
CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -107,6 +112,11 @@
return;
}
#endif
+#if defined(SHA256_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA256_ASM_NOHW)
CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -132,6 +142,11 @@
CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
}
#endif
+#if defined(SHA512_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA512_ASM_NOHW)
CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif