sha: Move Armv7 dispatching to C

sha256_block_data_order_hw required a bit of wrestling with Arm immediate limits. PC-relative addressing in 32-bit Arm is a huge mess. I think I could have avoided the extra load with a lot of effort (convincing the assembler to evaluate a messy expression), but this is simpler and there was no measurable performance difference.

Change-Id: I3fab4abc0fa24e0d689581e2c9b9faaa32bd7442
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64749
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl index c52b546..532a81b 100644 --- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl +++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,24 +197,11 @@ .code 32 #endif -.global sha1_block_data_order -.type sha1_block_data_order,%function +.global sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,%function .align 5 -sha1_block_data_order: -#if __ARM_MAX_ARCH__>=7 -.Lsha1_block: - adr r3,.Lsha1_block - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV8_SHA1 - bne .LARMv8 - tst r12,#ARMV7_NEON - bne .LNEON -#endif +sha1_block_data_order_nohw: stmdb sp!,{r4-r12,lr} add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp ldmia $ctx,{$a,$b,$c,$d,$e} @@ -304,17 +291,13 @@ moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size sha1_block_data_order,.-sha1_block_data_order +.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw .align 5 .LK_00_19: .word 0x5a827999 .LK_20_39: .word 0x6ed9eba1 .LK_40_59: .word 0x8f1bbcdc .LK_60_79: .word 0xca62c1d6 -#if __ARM_MAX_ARCH__>=7 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.Lsha1_block -#endif .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 5 ___ @@ -530,10 +513,10 @@ .arch armv7-a .fpu neon +.global sha1_block_data_order_neon .type sha1_block_data_order_neon,%function .align 4 sha1_block_data_order_neon: -.LNEON: stmdb sp!,{r4-r12,lr} add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp @ dmb @ errata #451034 on early Cortex A8 @@ -625,10 +608,10 @@ # define INST(a,b,c,d) .byte a,b,c,d|0x10 # endif -.type sha1_block_data_order_armv8,%function +.global sha1_block_data_order_hw +.type sha1_block_data_order_hw,%function .align 5 -sha1_block_data_order_armv8: -.LARMv8: +sha1_block_data_order_hw: vstmdb sp!,{d8-d15} @ ABI specification says so veor $E,$E,$E @@ -693,16 +676,10 @@ vldmia sp!,{d8-d15} ret @ bx lr -.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 +.size sha1_block_data_order_hw,.-sha1_block_data_order_hw #endif ___ }}} -$code.=<<___; -#if 
__ARM_MAX_ARCH__>=7 -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -___ { my %opcode = ( "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl index fa82f3c..59f3417 100644 --- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,34 +217,15 @@ .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.Lsha256_block_data_order -#endif .align 5 -.global sha256_block_data_order -.type sha256_block_data_order,%function -sha256_block_data_order: -.Lsha256_block_data_order: - adr r3,.Lsha256_block_data_order -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV8_SHA256 - bne .LARMv8 - tst r12,#ARMV7_NEON - bne .LNEON -#endif +.global sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +sha256_block_data_order_nohw: add $len,$inp,$len,lsl#6 @ len to point at the end of inp stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} - @ TODO(davidben): When the OPENSSL_armcap logic above is removed, - @ replace this with a simple ADR. - sub $Ktbl,r3,#256+32 @ K256 + adr $Ktbl,K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: # if __ARM_ARCH>=7 @@ -298,7 +279,7 @@ moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size sha256_block_data_order,.-sha256_block_data_order +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw ___ ###################################################################### # NEON stuff @@ -483,10 +464,12 @@ .align 5 .skip 16 sha256_block_data_order_neon: -.LNEON: stmdb sp!,{r4-r12,lr} sub $H,sp,#16*4+16 + @ In Arm mode, the following ADR runs up against the limits of encodable + @ offsets. It only fits because the offset, when the ADR is placed here, + @ is a multiple of 16. adr $Ktbl,K256 bic $H,$H,#15 @ align for 128-bit stores mov $t2,sp @@ -613,12 +596,26 @@ # define INST(a,b,c,d) .byte a,b,c,d # endif -.type sha256_block_data_order_armv8,%function +.LK256_shortcut: +@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. 
+#if defined(__thumb2__) +.word K256-(.LK256_add+4) +#else +.word K256-(.LK256_add+8) +#endif + +.global sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function .align 5 -sha256_block_data_order_armv8: -.LARMv8: +sha256_block_data_order_hw: + @ K256 is too far to reference from one ADR command in Thumb mode. In + @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte + @ boundary. For simplicity, just load the offset from .LK256_shortcut. + ldr $Ktbl,.LK256_shortcut +.LK256_add: + add $Ktbl,pc,$Ktbl + vld1.32 {$ABCD,$EFGH},[$ctx] - sub $Ktbl,$Ktbl,#256+32 add $len,$inp,$len,lsl#6 @ len to point at the end of inp b .Loop_v8 @@ -680,17 +677,13 @@ vst1.32 {$ABCD,$EFGH},[$ctx] ret @ bx lr -.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw #endif ___ }}} $code.=<<___; .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif ___ open SELF,$0;
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl index f52b5b0..f2d1d22 100644 --- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl +++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,33 +276,13 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.Lsha512_block_data_order -.skip 32-4 -#else -.skip 32 -#endif -.global sha512_block_data_order -.type sha512_block_data_order,%function -sha512_block_data_order: -.Lsha512_block_data_order: - adr r3,.Lsha512_block_data_order -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV7_NEON - bne .LNEON -#endif +.global sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +sha512_block_data_order_nohw: add $len,$inp,$len,lsl#7 @ len to point at the end of inp stmdb sp!,{r4-r12,lr} - @ TODO(davidben): When the OPENSSL_armcap logic above is removed, - @ replace this with a simple ADR. - sub $Ktbl,r3,#672 @ K512 + adr $Ktbl,K512 sub sp,sp,#9*8 ldr $Elo,[$ctx,#$Eoff+$lo] @@ -501,7 +481,7 @@ moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif -.size sha512_block_data_order,.-sha512_block_data_order +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw ___ { @@ -612,7 +592,6 @@ .type sha512_block_data_order_neon,%function .align 4 sha512_block_data_order_neon: -.LNEON: dmb @ errata #451034 on early Cortex A8 add $len,$inp,$len,lsl#7 @ len to point at the end of inp adr $Ktbl,K512 @@ -650,10 +629,6 @@ $code.=<<___; .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif ___ $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h index 28975e1..b55ea8e 100644 --- a/crypto/fipsmodule/sha/internal.h +++ b/crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@ // Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is // defined in assembly. -#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM)) +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) #define SHA1_ASM #define SHA256_ASM @@ -39,6 +39,35 @@ void sha512_block_data_order(uint64_t *state, const uint8_t *data, size_t num_blocks); +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) + +#define SHA1_ASM_NOHW +#define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW + +#define SHA1_ASM_HW +OPENSSL_INLINE int sha1_hw_capable(void) { + return CRYPTO_is_ARMv8_SHA1_capable(); +} + +#define SHA1_ASM_NEON +void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data, + size_t num); + +#define SHA256_ASM_HW +OPENSSL_INLINE int sha256_hw_capable(void) { + return CRYPTO_is_ARMv8_SHA256_capable(); +} + +#define SHA256_ASM_NEON +void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data, + size_t num); + +// Armv8.2 SHA-512 instructions are not available in 32-bit. +#define SHA512_ASM_NEON +void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data, + size_t num); + #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) #define SHA1_ASM_NOHW @@ -148,6 +177,7 @@ void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data, size_t num); #endif + #if defined(SHA512_ASM_NOHW) void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data, size_t num);
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c index 7b267e3..7a97266 100644 --- a/crypto/fipsmodule/sha/sha1.c +++ b/crypto/fipsmodule/sha/sha1.c
@@ -410,6 +410,12 @@ return; } #endif +#if defined(SHA1_ASM_NEON) + if (CRYPTO_is_NEON_capable()) { + sha1_block_data_order_neon(state, data, num); + return; + } +#endif sha1_block_data_order_nohw(state, data, num); }
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c index 0b0aca2..8cedc5f 100644 --- a/crypto/fipsmodule/sha/sha256.c +++ b/crypto/fipsmodule/sha/sha256.c
@@ -332,6 +332,12 @@ return; } #endif +#if defined(SHA256_ASM_NEON) + if (CRYPTO_is_NEON_capable()) { + sha256_block_data_order_neon(state, data, num); + return; + } +#endif sha256_block_data_order_nohw(state, data, num); }
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c index 0f4142c..d31ab71 100644 --- a/crypto/fipsmodule/sha/sha512.c +++ b/crypto/fipsmodule/sha/sha512.c
@@ -516,6 +516,12 @@ return; } #endif +#if defined(SHA512_ASM_NEON) + if (CRYPTO_is_NEON_capable()) { + sha512_block_data_order_neon(state, data, num); + return; + } +#endif sha512_block_data_order_nohw(state, data, num); }
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc index 22856f8..671c170 100644 --- a/crypto/fipsmodule/sha/sha_test.cc +++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -75,6 +75,11 @@ return; } #endif +#if defined(SHA1_ASM_NEON) + if (CRYPTO_is_NEON_capable()) { + CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks); + } +#endif #if defined(SHA1_ASM_NOHW) CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks); #endif @@ -107,6 +112,11 @@ return; } #endif +#if defined(SHA256_ASM_NEON) + if (CRYPTO_is_NEON_capable()) { + CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks); + } +#endif #if defined(SHA256_ASM_NOHW) CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks); #endif @@ -132,6 +142,11 @@ CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks); } #endif +#if defined(SHA512_ASM_NEON) + if (CRYPTO_is_NEON_capable()) { + CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks); + } +#endif #if defined(SHA512_ASM_NOHW) CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks); #endif