sha: Move Armv7 dispatching to C

sha256_block_data_order_hw required a bit of wrestling with Arm
immediate limits. PC-relative addressing in 32-bit Arm is a huge mess.
I think I could have avoided the extra load with a lot of effort
(convincing the assembler to evaluate a messy expression), but this is
simpler and there was no measurable performance difference.
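
For reference, the constraint here: an A32 ADR assembles to an ADD or
SUB of a "modified immediate" (an 8-bit value rotated right by an even
amount) to PC, while Thumb-2's ADR takes a plain 12-bit offset, so it
reaches at most 4095 bytes. The standalone checker below (not part of
this change) illustrates which A32 offsets fit:

    #include <assert.h>
    #include <stdint.h>

    /* Returns 1 if |v| fits an A32 "modified immediate": an 8-bit value
     * rotated right by an even amount. A PC-relative offset must have
     * this form to be materialized by a single ADR. */
    static int is_a32_modified_immediate(uint32_t v) {
      for (unsigned rot = 0; rot < 32; rot += 2) {
        /* Rotating |v| left by |rot| undoes a rotate-right encoding. */
        uint32_t base = rot == 0 ? v : (v << rot) | (v >> (32 - rot));
        if (base <= 0xff) {
          return 1;
        }
      }
      return 0;
    }

    int main(void) {
      /* Any multiple of 16 under 4 KiB fits: 8 significant bits. */
      assert(is_a32_modified_immediate(0xff0));
      /* The same distance, misaligned, does not. */
      assert(!is_a32_modified_immediate(0xff4));
      /* 64-byte alignment buys range beyond 4 KiB (8 bits << 6). */
      assert(is_a32_modified_immediate(0x3fc0));
      return 0;
    }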

Change-Id: I3fab4abc0fa24e0d689581e2c9b9faaa32bd7442
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64749
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
index c52b546..532a81b 100644
--- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,24 +197,11 @@
 .code	32
 #endif
 
-.global	sha1_block_data_order
-.type	sha1_block_data_order,%function
+.global	sha1_block_data_order_nohw
+.type	sha1_block_data_order_nohw,%function
 
 .align	5
-sha1_block_data_order:
-#if __ARM_MAX_ARCH__>=7
-.Lsha1_block:
-	adr	r3,.Lsha1_block
-	ldr	r12,.LOPENSSL_armcap
-	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
-	ldr	r12,[r12]
-#endif
-	tst	r12,#ARMV8_SHA1
-	bne	.LARMv8
-	tst	r12,#ARMV7_NEON
-	bne	.LNEON
-#endif
+sha1_block_data_order_nohw:
 	stmdb	sp!,{r4-r12,lr}
 	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
 	ldmia	$ctx,{$a,$b,$c,$d,$e}
@@ -304,17 +291,13 @@
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	sha1_block_data_order,.-sha1_block_data_order
+.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
 
 .align	5
 .LK_00_19:	.word	0x5a827999
 .LK_20_39:	.word	0x6ed9eba1
 .LK_40_59:	.word	0x8f1bbcdc
 .LK_60_79:	.word	0xca62c1d6
-#if __ARM_MAX_ARCH__>=7
-.LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-.Lsha1_block
-#endif
 .asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align	5
 ___
@@ -530,10 +513,10 @@
 .arch	armv7-a
 .fpu	neon
 
+.global	sha1_block_data_order_neon
 .type	sha1_block_data_order_neon,%function
 .align	4
 sha1_block_data_order_neon:
-.LNEON:
 	stmdb	sp!,{r4-r12,lr}
 	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
 	@ dmb				@ errata #451034 on early Cortex A8
@@ -625,10 +608,10 @@
 #  define INST(a,b,c,d)	.byte	a,b,c,d|0x10
 # endif
 
-.type	sha1_block_data_order_armv8,%function
+.global	sha1_block_data_order_hw
+.type	sha1_block_data_order_hw,%function
 .align	5
-sha1_block_data_order_armv8:
-.LARMv8:
+sha1_block_data_order_hw:
 	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 
 	veor	$E,$E,$E
@@ -693,16 +676,10 @@
 
 	vldmia	sp!,{d8-d15}
 	ret					@ bx lr
-.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
 #endif
 ___
 }}}
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.comm	OPENSSL_armcap_P,4,4
-.hidden	OPENSSL_armcap_P
-#endif
-___
 
 {   my  %opcode = (
 	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40,
diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
index fa82f3c..59f3417 100644
--- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,34 +217,15 @@
 .word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .size	K256,.-K256
 .word	0				@ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-.Lsha256_block_data_order
-#endif
 .align	5
 
-.global	sha256_block_data_order
-.type	sha256_block_data_order,%function
-sha256_block_data_order:
-.Lsha256_block_data_order:
-	adr	r3,.Lsha256_block_data_order
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-	ldr	r12,.LOPENSSL_armcap
-	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
-	ldr	r12,[r12]
-#endif
-	tst	r12,#ARMV8_SHA256
-	bne	.LARMv8
-	tst	r12,#ARMV7_NEON
-	bne	.LNEON
-#endif
+.global	sha256_block_data_order_nohw
+.type	sha256_block_data_order_nohw,%function
+sha256_block_data_order_nohw:
 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
 	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
-	@ TODO(davidben): When the OPENSSL_armcap logic above is removed,
-	@ replace this with a simple ADR.
-	sub	$Ktbl,r3,#256+32	@ K256
+	adr	$Ktbl,K256
 	sub	sp,sp,#16*4		@ alloca(X[16])
 .Loop:
 # if __ARM_ARCH>=7
@@ -298,7 +279,7 @@
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	sha256_block_data_order,.-sha256_block_data_order
+.size	sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
 ___
 ######################################################################
 # NEON stuff
@@ -483,10 +464,12 @@
 .align	5
 .skip	16
 sha256_block_data_order_neon:
-.LNEON:
 	stmdb	sp!,{r4-r12,lr}
 
 	sub	$H,sp,#16*4+16
+	@ In Arm mode, the following ADR runs up against the limits of encodable
+	@ offsets. It only fits because the offset, when the ADR is placed here,
+	@ is a multiple of 16.
 	adr	$Ktbl,K256
 	bic	$H,$H,#15		@ align for 128-bit stores
 	mov	$t2,sp
@@ -613,12 +596,26 @@
 #  define INST(a,b,c,d)	.byte	a,b,c,d
 # endif
 
-.type	sha256_block_data_order_armv8,%function
+.LK256_shortcut:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word	K256-(.LK256_add+4)
+#else
+.word	K256-(.LK256_add+8)
+#endif
+
+.global	sha256_block_data_order_hw
+.type	sha256_block_data_order_hw,%function
 .align	5
-sha256_block_data_order_armv8:
-.LARMv8:
+sha256_block_data_order_hw:
+	@ K256 is too far to reference from one ADR instruction in Thumb mode.
+	@ In Arm mode, it would fit if the ADR offset were aligned to a 64-byte
+	@ boundary. For simplicity, just load the offset from .LK256_shortcut.
+	ldr	$Ktbl,.LK256_shortcut
+.LK256_add:
+	add	$Ktbl,pc,$Ktbl
+
 	vld1.32	{$ABCD,$EFGH},[$ctx]
-	sub	$Ktbl,$Ktbl,#256+32
 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 	b	.Loop_v8
 
@@ -680,17 +677,13 @@
 	vst1.32		{$ABCD,$EFGH},[$ctx]
 
 	ret		@ bx lr
-.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
 #endif
 ___
 }}}
 $code.=<<___;
 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm   OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
-#endif
 ___
 
 open SELF,$0;
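
The .LK256_shortcut arithmetic above can be sanity-checked in C. The
addresses in this sketch are made up; the point is that the stored word,
added to the PC value the ADD observes (8 bytes ahead of .LK256_add in
Arm mode, 4 bytes ahead in Thumb mode), lands exactly on K256, with
unsigned wraparound handling the negative offset:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      /* Hypothetical link-time addresses; only the arithmetic matters. */
      uint32_t k256 = 0x10400;      /* K256 sits before the function */
      uint32_t lk256_add = 0x12000; /* address of the ADD at .LK256_add */

      /* Arm mode: the assembler emits K256 - (.LK256_add + 8). */
      uint32_t stored = k256 - (lk256_add + 8);
      assert(lk256_add + 8 + stored == k256);

      /* Thumb mode: PC reads 4 ahead, hence the other #if branch. */
      stored = k256 - (lk256_add + 4);
      assert(lk256_add + 4 + stored == k256);
      return 0;
    }
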
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
index f52b5b0..f2d1d22 100644
--- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,33 +276,13 @@
 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-.Lsha512_block_data_order
-.skip	32-4
-#else
-.skip	32
-#endif
 
-.global	sha512_block_data_order
-.type	sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
-	adr	r3,.Lsha512_block_data_order
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-	ldr	r12,.LOPENSSL_armcap
-	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
-	ldr	r12,[r12]
-#endif
-	tst	r12,#ARMV7_NEON
-	bne	.LNEON
-#endif
+.global	sha512_block_data_order_nohw
+.type	sha512_block_data_order_nohw,%function
+sha512_block_data_order_nohw:
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
 	stmdb	sp!,{r4-r12,lr}
-	@ TODO(davidben): When the OPENSSL_armcap logic above is removed,
-	@ replace this with a simple ADR.
-	sub	$Ktbl,r3,#672		@ K512
+	adr	$Ktbl,K512
 	sub	sp,sp,#9*8
 
 	ldr	$Elo,[$ctx,#$Eoff+$lo]
@@ -501,7 +481,7 @@
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	sha512_block_data_order,.-sha512_block_data_order
+.size	sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
 ___
 
 {
@@ -612,7 +592,6 @@
 .type	sha512_block_data_order_neon,%function
 .align	4
 sha512_block_data_order_neon:
-.LNEON:
 	dmb				@ errata #451034 on early Cortex A8
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
 	adr	$Ktbl,K512
@@ -650,10 +629,6 @@
 $code.=<<___;
 .asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm	OPENSSL_armcap_P,4,4
-.hidden	OPENSSL_armcap_P
-#endif
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 28975e1..b55ea8e 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@
 // Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
 // defined in assembly.
 
-#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
 
 #define SHA1_ASM
 #define SHA256_ASM
@@ -39,6 +39,35 @@
 void sha512_block_data_order(uint64_t *state, const uint8_t *data,
                              size_t num_blocks);
 
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
+
+#define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
+#define SHA512_ASM_NOHW
+
+#define SHA1_ASM_HW
+OPENSSL_INLINE int sha1_hw_capable(void) {
+  return CRYPTO_is_ARMv8_SHA1_capable();
+}
+
+#define SHA1_ASM_NEON
+void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data,
+                                size_t num);
+
+#define SHA256_ASM_HW
+OPENSSL_INLINE int sha256_hw_capable(void) {
+  return CRYPTO_is_ARMv8_SHA256_capable();
+}
+
+#define SHA256_ASM_NEON
+void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data,
+                                  size_t num);
+
+// Armv8.2 SHA-512 instructions are not available in 32-bit mode.
+#define SHA512_ASM_NEON
+void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data,
+                                  size_t num);
+
 #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
 
 #define SHA1_ASM_NOHW
@@ -148,6 +177,7 @@
 void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
                                 size_t num);
 #endif
+
 #if defined(SHA512_ASM_NOHW)
 void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
                                   size_t num);
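
With these macros in place, each dispatcher follows the same shape. A
rough sketch of the 32-bit Arm path for SHA-256, assuming the names
declared above (the real dispatcher in sha256.c, partly visible in the
hunks below, also carries branches for the other SHA256_ASM variants):

    #include <stddef.h>
    #include <stdint.h>

    #include "internal.h"

    static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
                                        size_t num) {
    #if defined(SHA256_ASM_HW)
      if (sha256_hw_capable()) {
        sha256_block_data_order_hw(state, data, num);
        return;
      }
    #endif
    #if defined(SHA256_ASM_NEON)
      if (CRYPTO_is_NEON_capable()) {
        sha256_block_data_order_neon(state, data, num);
        return;
      }
    #endif
      sha256_block_data_order_nohw(state, data, num);
    }
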
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index 7b267e3..7a97266 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -410,6 +410,12 @@
     return;
   }
 #endif
+#if defined(SHA1_ASM_NEON)
+  if (CRYPTO_is_NEON_capable()) {
+    sha1_block_data_order_neon(state, data, num);
+    return;
+  }
+#endif
   sha1_block_data_order_nohw(state, data, num);
 }
 
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
index 0b0aca2..8cedc5f 100644
--- a/crypto/fipsmodule/sha/sha256.c
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -332,6 +332,12 @@
     return;
   }
 #endif
+#if defined(SHA256_ASM_NEON)
+  if (CRYPTO_is_NEON_capable()) {
+    sha256_block_data_order_neon(state, data, num);
+    return;
+  }
+#endif
   sha256_block_data_order_nohw(state, data, num);
 }
 
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
index 0f4142c..d31ab71 100644
--- a/crypto/fipsmodule/sha/sha512.c
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -516,6 +516,12 @@
     return;
   }
 #endif
+#if defined(SHA512_ASM_NEON)
+  if (CRYPTO_is_NEON_capable()) {
+    sha512_block_data_order_neon(state, data, num);
+    return;
+  }
+#endif
   sha512_block_data_order_nohw(state, data, num);
 }
 
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc
index 22856f8..671c170 100644
--- a/crypto/fipsmodule/sha/sha_test.cc
+++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -75,6 +75,11 @@
       return;
     }
 #endif
+#if defined(SHA1_ASM_NEON)
+    if (CRYPTO_is_NEON_capable()) {
+      CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
+    }
+#endif
 #if defined(SHA1_ASM_NOHW)
     CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
 #endif
@@ -107,6 +112,11 @@
       return;
     }
 #endif
+#if defined(SHA256_ASM_NEON)
+    if (CRYPTO_is_NEON_capable()) {
+      CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
+    }
+#endif
 #if defined(SHA256_ASM_NOHW)
     CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
 #endif
@@ -132,6 +142,11 @@
       CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
     }
 #endif
+#if defined(SHA512_ASM_NEON)
+    if (CRYPTO_is_NEON_capable()) {
+      CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
+    }
+#endif
 #if defined(SHA512_ASM_NOHW)
     CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
 #endif