Revert "sha: Move Armv7 dispatching to C"

google3 is unhappy with this change, which needs some more work on where
the adr target gets placed to fit.

This reverts commit 62f43f5ea57b9b208fc784e5fa959bce89ebd718.

Change-Id: I1e335c635590fdda72a8a98314a1640d5b7ea179
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/65328
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl index 532a81b..c52b546 100644 --- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl +++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,11 +197,24 @@ .code 32 #endif -.global sha1_block_data_order_nohw -.type sha1_block_data_order_nohw,%function +.global sha1_block_data_order +.type sha1_block_data_order,%function .align 5 -sha1_block_data_order_nohw: +sha1_block_data_order: +#if __ARM_MAX_ARCH__>=7 +.Lsha1_block: + adr r3,.Lsha1_block + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA1 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif stmdb sp!,{r4-r12,lr} add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp ldmia $ctx,{$a,$b,$c,$d,$e} @@ -291,13 +304,17 @@ moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw +.size sha1_block_data_order,.-sha1_block_data_order .align 5 .LK_00_19: .word 0x5a827999 .LK_20_39: .word 0x6ed9eba1 .LK_40_59: .word 0x8f1bbcdc .LK_60_79: .word 0xca62c1d6 +#if __ARM_MAX_ARCH__>=7 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha1_block +#endif .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 5 ___ @@ -513,10 +530,10 @@ .arch armv7-a .fpu neon -.global sha1_block_data_order_neon .type sha1_block_data_order_neon,%function .align 4 sha1_block_data_order_neon: +.LNEON: stmdb sp!,{r4-r12,lr} add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp @ dmb @ errata #451034 on early Cortex A8 @@ -608,10 +625,10 @@ # define INST(a,b,c,d) .byte a,b,c,d|0x10 # endif -.global sha1_block_data_order_hw -.type sha1_block_data_order_hw,%function +.type sha1_block_data_order_armv8,%function .align 5 -sha1_block_data_order_hw: +sha1_block_data_order_armv8: +.LARMv8: vstmdb sp!,{d8-d15} @ ABI specification says so veor $E,$E,$E @@ -676,10 +693,16 @@ vldmia sp!,{d8-d15} ret @ bx lr -.size sha1_block_data_order_hw,.-sha1_block_data_order_hw +.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 #endif ___ }}} +$code.=<<___; +#if 
__ARM_MAX_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +___ { my %opcode = ( "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl index 59f3417..fa82f3c 100644 --- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,15 +217,34 @@ .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha256_block_data_order +#endif .align 5 -.global sha256_block_data_order_nohw -.type sha256_block_data_order_nohw,%function -sha256_block_data_order_nohw: +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +.Lsha256_block_data_order: + adr r3,.Lsha256_block_data_order +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif add $len,$inp,$len,lsl#6 @ len to point at the end of inp stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} - adr $Ktbl,K256 + @ TODO(davidben): When the OPENSSL_armcap logic above is removed, + @ replace this with a simple ADR. + sub $Ktbl,r3,#256+32 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: # if __ARM_ARCH>=7 @@ -279,7 +298,7 @@ moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +.size sha256_block_data_order,.-sha256_block_data_order ___ ###################################################################### # NEON stuff @@ -464,12 +483,10 @@ .align 5 .skip 16 sha256_block_data_order_neon: +.LNEON: stmdb sp!,{r4-r12,lr} sub $H,sp,#16*4+16 - @ In Arm mode, the following ADR runs up against the limits of encodable - @ offsets. It only fits because the offset, when the ADR is placed here, - @ is a multiple of 16. adr $Ktbl,K256 bic $H,$H,#15 @ align for 128-bit stores mov $t2,sp @@ -596,26 +613,12 @@ # define INST(a,b,c,d) .byte a,b,c,d # endif -.LK256_shortcut: -@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. 
-#if defined(__thumb2__) -.word K256-(.LK256_add+4) -#else -.word K256-(.LK256_add+8) -#endif - -.global sha256_block_data_order_hw -.type sha256_block_data_order_hw,%function +.type sha256_block_data_order_armv8,%function .align 5 -sha256_block_data_order_hw: - @ K256 is too far to reference from one ADR command in Thumb mode. In - @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte - @ boundary. For simplicity, just load the offset from .LK256_shortcut. - ldr $Ktbl,.LK256_shortcut -.LK256_add: - add $Ktbl,pc,$Ktbl - +sha256_block_data_order_armv8: +.LARMv8: vld1.32 {$ABCD,$EFGH},[$ctx] + sub $Ktbl,$Ktbl,#256+32 add $len,$inp,$len,lsl#6 @ len to point at the end of inp b .Loop_v8 @@ -677,13 +680,17 @@ vst1.32 {$ABCD,$EFGH},[$ctx] ret @ bx lr -.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 #endif ___ }}} $code.=<<___; .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif ___ open SELF,$0;
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl index f2d1d22..f52b5b0 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,13 +276,33 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lsha512_block_data_order +.skip 32-4 +#else +.skip 32 +#endif -.global sha512_block_data_order_nohw -.type sha512_block_data_order_nohw,%function -sha512_block_data_order_nohw: +.global sha512_block_data_order +.type sha512_block_data_order,%function +sha512_block_data_order: +.Lsha512_block_data_order: + adr r3,.Lsha512_block_data_order +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON + bne .LNEON +#endif add $len,$inp,$len,lsl#7 @ len to point at the end of inp stmdb sp!,{r4-r12,lr} - adr $Ktbl,K512 + @ TODO(davidben): When the OPENSSL_armcap logic above is removed, + @ replace this with a simple ADR. + sub $Ktbl,r3,#672 @ K512 sub sp,sp,#9*8 ldr $Elo,[$ctx,#$Eoff+$lo] @@ -481,7 +501,7 @@ moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +.size sha512_block_data_order,.-sha512_block_data_order ___ { @@ -592,6 +612,7 @@ .type sha512_block_data_order_neon,%function .align 4 sha512_block_data_order_neon: +.LNEON: dmb @ errata #451034 on early Cortex A8 add $len,$inp,$len,lsl#7 @ len to point at the end of inp adr $Ktbl,K512 @@ -629,6 +650,10 @@ $code.=<<___; .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif ___ $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h index 0c05d73..7c15b2c 100644 --- a/crypto/fipsmodule/sha/internal.h +++ b/crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@ // Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is // defined in assembly. -#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) +#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM)) #define SHA1_ASM #define SHA256_ASM @@ -39,35 +39,6 @@ void sha512_block_data_order(uint64_t *state, const uint8_t *data, size_t num_blocks); -#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) - -#define SHA1_ASM_NOHW -#define SHA256_ASM_NOHW -#define SHA512_ASM_NOHW - -#define SHA1_ASM_HW -OPENSSL_INLINE int sha1_hw_capable(void) { - return CRYPTO_is_ARMv8_SHA1_capable(); -} - -#define SHA1_ASM_NEON -void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data, - size_t num); - -#define SHA256_ASM_HW -OPENSSL_INLINE int sha256_hw_capable(void) { - return CRYPTO_is_ARMv8_SHA256_capable(); -} - -#define SHA256_ASM_NEON -void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data, - size_t num); - -// Armv8.2 SHA-512 instructions are not available in 32-bit. -#define SHA512_ASM_NEON -void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data, - size_t num); - #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) #define SHA1_ASM_NOHW @@ -178,7 +149,6 @@ void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data, size_t num); #endif - #if defined(SHA512_ASM_NOHW) void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data, size_t num);
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c index 7a97266..7b267e3 100644 --- a/crypto/fipsmodule/sha/sha1.c +++ b/crypto/fipsmodule/sha/sha1.c
@@ -410,12 +410,6 @@ return; } #endif -#if defined(SHA1_ASM_NEON) - if (CRYPTO_is_NEON_capable()) { - sha1_block_data_order_neon(state, data, num); - return; - } -#endif sha1_block_data_order_nohw(state, data, num); }
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c index 8cedc5f..0b0aca2 100644 --- a/crypto/fipsmodule/sha/sha256.c +++ b/crypto/fipsmodule/sha/sha256.c
@@ -332,12 +332,6 @@ return; } #endif -#if defined(SHA256_ASM_NEON) - if (CRYPTO_is_NEON_capable()) { - sha256_block_data_order_neon(state, data, num); - return; - } -#endif sha256_block_data_order_nohw(state, data, num); }
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c index d31ab71..0f4142c 100644 --- a/crypto/fipsmodule/sha/sha512.c +++ b/crypto/fipsmodule/sha/sha512.c
@@ -516,12 +516,6 @@ return; } #endif -#if defined(SHA512_ASM_NEON) - if (CRYPTO_is_NEON_capable()) { - sha512_block_data_order_neon(state, data, num); - return; - } -#endif sha512_block_data_order_nohw(state, data, num); }
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc index 671c170..22856f8 100644 --- a/crypto/fipsmodule/sha/sha_test.cc +++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -75,11 +75,6 @@ return; } #endif -#if defined(SHA1_ASM_NEON) - if (CRYPTO_is_NEON_capable()) { - CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks); - } -#endif #if defined(SHA1_ASM_NOHW) CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks); #endif @@ -112,11 +107,6 @@ return; } #endif -#if defined(SHA256_ASM_NEON) - if (CRYPTO_is_NEON_capable()) { - CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks); - } -#endif #if defined(SHA256_ASM_NOHW) CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks); #endif @@ -142,11 +132,6 @@ CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks); } #endif -#if defined(SHA512_ASM_NEON) - if (CRYPTO_is_NEON_capable()) { - CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks); - } -#endif #if defined(SHA512_ASM_NOHW) CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks); #endif