sha: Move Armv7 dispatching to C (reland)
This is a reland of
https://boringssl-review.googlesource.com/c/boringssl/+/64749, which was
reverted in
https://boringssl-review.googlesource.com/c/boringssl/+/65328 due to
issues in Arm mode (i.e. not Thumb mode) builds that target Armv6+
instead of Armv7+.
The issue was that sha256_block_data_order_nohw has slightly different
sizes depending on __ARM_ARCH. Prior to moving the dispatch, the sizes
worked out such that they were always encodable in ADR. After moving the
dispatch, the instructions got shorter, such that the Armv7+ build still
worked, but the Armv6+ build needed to encode an offset of 0x1060
(previously 0x1080), which does not fit.
See https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/ for
details on Arm's very fussy immediate value encoding. It's not the only
form used for ADR (Thumb2 works very differently), but it's the
applicable one here.
While we could shuffle things around, this is all far too fragile. Just
use the LDR; ADD pattern we used for the other function. ADRL would
avoid a load (it splits the offset into two constants without a constant
bank), but that's a pseudo-instruction that's only supported by gas.
clang's assembler didn't want to implement it. Android has a macro at
https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm,
but it didn't work for me when I tried it. Also, searching around, it
sounds like ADRL in gas only works in Arm mode and not Thumb mode?
We could probably work through all that, but the compiler emits constant
banks on 32-bit Arm all the time. (I got this pattern from Clang's
output.) This is probably not worth the trouble.
Bug: 673
Change-Id: I165544764a931b293aa66fb3fc9bb8f01eeb8092
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/65808
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
index c52b546..532a81b 100644
--- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,24 +197,11 @@
.code 32
#endif
-.global sha1_block_data_order
-.type sha1_block_data_order,%function
+.global sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,%function
.align 5
-sha1_block_data_order:
-#if __ARM_MAX_ARCH__>=7
-.Lsha1_block:
- adr r3,.Lsha1_block
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV8_SHA1
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+sha1_block_data_order_nohw:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
@@ -304,17 +291,13 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha1_block_data_order,.-sha1_block_data_order
+.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
-#if __ARM_MAX_ARCH__>=7
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha1_block
-#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
@@ -530,10 +513,10 @@
.arch armv7-a
.fpu neon
+.global sha1_block_data_order_neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
-.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@@ -625,10 +608,10 @@
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
-.type sha1_block_data_order_armv8,%function
+.global sha1_block_data_order_hw
+.type sha1_block_data_order_hw,%function
.align 5
-sha1_block_data_order_armv8:
-.LARMv8:
+sha1_block_data_order_hw:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
@@ -693,16 +676,10 @@
vldmia sp!,{d8-d15}
ret @ bx lr
-.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
#endif
___
}}}
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
-___
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
index fa82f3c..99b8b2a 100644
--- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,34 +217,15 @@
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha256_block_data_order
-#endif
.align 5
-.global sha256_block_data_order
-.type sha256_block_data_order,%function
-sha256_block_data_order:
-.Lsha256_block_data_order:
- adr r3,.Lsha256_block_data_order
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV8_SHA256
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+.global sha256_block_data_order_nohw
+.type sha256_block_data_order_nohw,%function
+sha256_block_data_order_nohw:
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- @ TODO(davidben): When the OPENSSL_armcap logic above is removed,
- @ replace this with a simple ADR.
- sub $Ktbl,r3,#256+32 @ K256
+ adr $Ktbl,K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
@@ -298,7 +279,7 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha256_block_data_order,.-sha256_block_data_order
+.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
@@ -478,16 +459,37 @@
.arch armv7-a
.fpu neon
+.LK256_shortcut_neon:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_neon+4)
+#else
+.word K256-(.LK256_add_neon+8)
+#endif
+
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
-.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
- adr $Ktbl,K256
+
+ @ K256 is just at the boundary of being easily referenced by an ADR from
+ @ this function. In Arm mode, when building with __ARM_ARCH=6, it does
+ @ not fit. By moving code around, we could make it fit, but this is too
+ @ fragile. For simplicity, just load the offset from
+ @ .LK256_shortcut_neon.
+ @
+ @ TODO(davidben): adrl would avoid a load, but clang-assembler does not
+ @ support it. We might be able to emulate it with a macro, but Android's
+ @ did not work when I tried it.
+ @ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
+ ldr $Ktbl,.LK256_shortcut_neon
+.LK256_add_neon:
+ add $Ktbl,pc,$Ktbl
+
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
@@ -613,12 +615,26 @@
# define INST(a,b,c,d) .byte a,b,c,d
# endif
-.type sha256_block_data_order_armv8,%function
+.LK256_shortcut_hw:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_hw+4)
+#else
+.word K256-(.LK256_add_hw+8)
+#endif
+
+.global sha256_block_data_order_hw
+.type sha256_block_data_order_hw,%function
.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
+sha256_block_data_order_hw:
+ @ K256 is too far to reference from one ADR command in Thumb mode. In
+ @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
+ @ boundary. For simplicity, just load the offset from .LK256_shortcut_hw.
+ ldr $Ktbl,.LK256_shortcut_hw
+.LK256_add_hw:
+ add $Ktbl,pc,$Ktbl
+
vld1.32 {$ABCD,$EFGH},[$ctx]
- sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8
@@ -680,17 +696,13 @@
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
-.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
___
open SELF,$0;
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
index f52b5b0..f2d1d22 100644
--- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,33 +276,13 @@
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lsha512_block_data_order
-.skip 32-4
-#else
-.skip 32
-#endif
-.global sha512_block_data_order
-.type sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
- adr r3,.Lsha512_block_data_order
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
- ldr r12,[r12]
-#endif
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
+.global sha512_block_data_order_nohw
+.type sha512_block_data_order_nohw,%function
+sha512_block_data_order_nohw:
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
- @ TODO(davidben): When the OPENSSL_armcap logic above is removed,
- @ replace this with a simple ADR.
- sub $Ktbl,r3,#672 @ K512
+ adr $Ktbl,K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@@ -501,7 +481,7 @@
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
-.size sha512_block_data_order,.-sha512_block_data_order
+.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
___
{
@@ -612,7 +592,6 @@
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
-.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
@@ -650,10 +629,6 @@
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 7c15b2c..0c05d73 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@
// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
// defined in assembly.
-#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
#define SHA1_ASM
#define SHA256_ASM
@@ -39,6 +39,35 @@
void sha512_block_data_order(uint64_t *state, const uint8_t *data,
size_t num_blocks);
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
+
+#define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
+#define SHA512_ASM_NOHW
+
+#define SHA1_ASM_HW
+OPENSSL_INLINE int sha1_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA1_capable();
+}
+
+#define SHA1_ASM_NEON
+void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+#define SHA256_ASM_HW
+OPENSSL_INLINE int sha256_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA256_capable();
+}
+
+#define SHA256_ASM_NEON
+void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+// Armv8.2 SHA-512 instructions are not available in 32-bit.
+#define SHA512_ASM_NEON
+void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data,
+ size_t num);
+
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
#define SHA1_ASM_NOHW
@@ -149,6 +178,7 @@
void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
size_t num);
#endif
+
#if defined(SHA512_ASM_NOHW)
void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
size_t num);
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index 7b267e3..7a97266 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -410,6 +410,12 @@
return;
}
#endif
+#if defined(SHA1_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha1_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha1_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
index 0b0aca2..8cedc5f 100644
--- a/crypto/fipsmodule/sha/sha256.c
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -332,6 +332,12 @@
return;
}
#endif
+#if defined(SHA256_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha256_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha256_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
index 0f4142c..d31ab71 100644
--- a/crypto/fipsmodule/sha/sha512.c
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -516,6 +516,12 @@
return;
}
#endif
+#if defined(SHA512_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ sha512_block_data_order_neon(state, data, num);
+ return;
+ }
+#endif
sha512_block_data_order_nohw(state, data, num);
}
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc
index 22856f8..671c170 100644
--- a/crypto/fipsmodule/sha/sha_test.cc
+++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -75,6 +75,11 @@
return;
}
#endif
+#if defined(SHA1_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA1_ASM_NOHW)
CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -107,6 +112,11 @@
return;
}
#endif
+#if defined(SHA256_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA256_ASM_NOHW)
CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
@@ -132,6 +142,11 @@
CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
}
#endif
+#if defined(SHA512_ASM_NEON)
+ if (CRYPTO_is_NEON_capable()) {
+ CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
+ }
+#endif
#if defined(SHA512_ASM_NOHW)
CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif