Revert "sha: Move Armv7 dispatching to C"

google3 is unhappy with this change: the adr target placement needs
further adjustment so that the offset fits within encoding limits.

This reverts commit 62f43f5ea57b9b208fc784e5fa959bce89ebd718.

Change-Id: I1e335c635590fdda72a8a98314a1640d5b7ea179
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/65328
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
index 532a81b..c52b546 100644
--- a/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-armv4-large.pl
@@ -197,11 +197,24 @@
 .code	32
 #endif
 
-.global	sha1_block_data_order_nohw
-.type	sha1_block_data_order_nohw,%function
+.global	sha1_block_data_order
+.type	sha1_block_data_order,%function
 
 .align	5
-sha1_block_data_order_nohw:
+sha1_block_data_order:
+#if __ARM_MAX_ARCH__>=7
+.Lsha1_block:
+	adr	r3,.Lsha1_block
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV8_SHA1
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
 	stmdb	sp!,{r4-r12,lr}
 	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
 	ldmia	$ctx,{$a,$b,$c,$d,$e}
@@ -291,13 +304,17 @@
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
+.size	sha1_block_data_order,.-sha1_block_data_order
 
 .align	5
 .LK_00_19:	.word	0x5a827999
 .LK_20_39:	.word	0x6ed9eba1
 .LK_40_59:	.word	0x8f1bbcdc
 .LK_60_79:	.word	0xca62c1d6
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lsha1_block
+#endif
 .asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align	5
 ___
@@ -513,10 +530,10 @@
 .arch	armv7-a
 .fpu	neon
 
-.global	sha1_block_data_order_neon
 .type	sha1_block_data_order_neon,%function
 .align	4
 sha1_block_data_order_neon:
+.LNEON:
 	stmdb	sp!,{r4-r12,lr}
 	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
 	@ dmb				@ errata #451034 on early Cortex A8
@@ -608,10 +625,10 @@
 #  define INST(a,b,c,d)	.byte	a,b,c,d|0x10
 # endif
 
-.global	sha1_block_data_order_hw
-.type	sha1_block_data_order_hw,%function
+.type	sha1_block_data_order_armv8,%function
 .align	5
-sha1_block_data_order_hw:
+sha1_block_data_order_armv8:
+.LARMv8:
 	vstmdb	sp!,{d8-d15}		@ ABI specification says so
 
 	veor	$E,$E,$E
@@ -676,10 +693,16 @@
 
 	vldmia	sp!,{d8-d15}
 	ret					@ bx lr
-.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
+.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
 #endif
 ___
 }}}
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+___
 
 {   my  %opcode = (
 	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40,
diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
index 59f3417..fa82f3c 100644
--- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -217,15 +217,34 @@
 .word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .size	K256,.-K256
 .word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lsha256_block_data_order
+#endif
 .align	5
 
-.global	sha256_block_data_order_nohw
-.type	sha256_block_data_order_nohw,%function
-sha256_block_data_order_nohw:
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+	adr	r3,.Lsha256_block_data_order
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
 	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
-	adr	$Ktbl,K256
+	@ TODO(davidben): When the OPENSSL_armcap logic above is removed,
+	@ replace this with a simple ADR.
+	sub	$Ktbl,r3,#256+32	@ K256
 	sub	sp,sp,#16*4		@ alloca(X[16])
 .Loop:
 # if __ARM_ARCH>=7
@@ -279,7 +298,7 @@
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
+.size	sha256_block_data_order,.-sha256_block_data_order
 ___
 ######################################################################
 # NEON stuff
@@ -464,12 +483,10 @@
 .align	5
 .skip	16
 sha256_block_data_order_neon:
+.LNEON:
 	stmdb	sp!,{r4-r12,lr}
 
 	sub	$H,sp,#16*4+16
-	@ In Arm mode, the following ADR runs up against the limits of encodable
-	@ offsets. It only fits because the offset, when the ADR is placed here,
-	@ is a multiple of 16.
 	adr	$Ktbl,K256
 	bic	$H,$H,#15		@ align for 128-bit stores
 	mov	$t2,sp
@@ -596,26 +613,12 @@
 #  define INST(a,b,c,d)	.byte	a,b,c,d
 # endif
 
-.LK256_shortcut:
-@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
-#if defined(__thumb2__)
-.word	K256-(.LK256_add+4)
-#else
-.word	K256-(.LK256_add+8)
-#endif
-
-.global	sha256_block_data_order_hw
-.type	sha256_block_data_order_hw,%function
+.type	sha256_block_data_order_armv8,%function
 .align	5
-sha256_block_data_order_hw:
-	@ K256 is too far to reference from one ADR command in Thumb mode. In
-	@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
-	@ boundary. For simplicity, just load the offset from .LK256_shortcut.
-	ldr	$Ktbl,.LK256_shortcut
-.LK256_add:
-	add	$Ktbl,pc,$Ktbl
-
+sha256_block_data_order_armv8:
+.LARMv8:
 	vld1.32	{$ABCD,$EFGH},[$ctx]
+	sub	$Ktbl,$Ktbl,#256+32
 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 	b	.Loop_v8
 
@@ -677,13 +680,17 @@
 	vst1.32		{$ABCD,$EFGH},[$ctx]
 
 	ret		@ bx lr
-.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
 #endif
 ___
 }}}
 $code.=<<___;
 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
 ___
 
 open SELF,$0;
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv4.pl b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
index f2d1d22..f52b5b0 100644
--- a/crypto/fipsmodule/sha/asm/sha512-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-armv4.pl
@@ -276,13 +276,33 @@
 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lsha512_block_data_order
+.skip	32-4
+#else
+.skip	32
+#endif
 
-.global	sha512_block_data_order_nohw
-.type	sha512_block_data_order_nohw,%function
-sha512_block_data_order_nohw:
+.global	sha512_block_data_order
+.type	sha512_block_data_order,%function
+sha512_block_data_order:
+.Lsha512_block_data_order:
+	adr	r3,.Lsha512_block_data_order
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
 	stmdb	sp!,{r4-r12,lr}
-	adr	$Ktbl,K512
+	@ TODO(davidben): When the OPENSSL_armcap logic above is removed,
+	@ replace this with a simple ADR.
+	sub	$Ktbl,r3,#672		@ K512
 	sub	sp,sp,#9*8
 
 	ldr	$Elo,[$ctx,#$Eoff+$lo]
@@ -481,7 +501,7 @@
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
+.size	sha512_block_data_order,.-sha512_block_data_order
 ___
 
 {
@@ -592,6 +612,7 @@
 .type	sha512_block_data_order_neon,%function
 .align	4
 sha512_block_data_order_neon:
+.LNEON:
 	dmb				@ errata #451034 on early Cortex A8
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
 	adr	$Ktbl,K512
@@ -629,6 +650,10 @@
 $code.=<<___;
 .asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 0c05d73..7c15b2c 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -26,7 +26,7 @@
 // Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
 // defined in assembly.
 
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
+#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
 
 #define SHA1_ASM
 #define SHA256_ASM
@@ -39,35 +39,6 @@
 void sha512_block_data_order(uint64_t *state, const uint8_t *data,
                              size_t num_blocks);
 
-#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
-
-#define SHA1_ASM_NOHW
-#define SHA256_ASM_NOHW
-#define SHA512_ASM_NOHW
-
-#define SHA1_ASM_HW
-OPENSSL_INLINE int sha1_hw_capable(void) {
-  return CRYPTO_is_ARMv8_SHA1_capable();
-}
-
-#define SHA1_ASM_NEON
-void sha1_block_data_order_neon(uint32_t *state, const uint8_t *data,
-                                size_t num);
-
-#define SHA256_ASM_HW
-OPENSSL_INLINE int sha256_hw_capable(void) {
-  return CRYPTO_is_ARMv8_SHA256_capable();
-}
-
-#define SHA256_ASM_NEON
-void sha256_block_data_order_neon(uint32_t *state, const uint8_t *data,
-                                  size_t num);
-
-// Armv8.2 SHA-512 instructions are not available in 32-bit.
-#define SHA512_ASM_NEON
-void sha512_block_data_order_neon(uint64_t *state, const uint8_t *data,
-                                  size_t num);
-
 #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
 
 #define SHA1_ASM_NOHW
@@ -178,7 +149,6 @@
 void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
                                 size_t num);
 #endif
-
 #if defined(SHA512_ASM_NOHW)
 void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
                                   size_t num);
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index 7a97266..7b267e3 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -410,12 +410,6 @@
     return;
   }
 #endif
-#if defined(SHA1_ASM_NEON)
-  if (CRYPTO_is_NEON_capable()) {
-    sha1_block_data_order_neon(state, data, num);
-    return;
-  }
-#endif
   sha1_block_data_order_nohw(state, data, num);
 }
 
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
index 8cedc5f..0b0aca2 100644
--- a/crypto/fipsmodule/sha/sha256.c
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -332,12 +332,6 @@
     return;
   }
 #endif
-#if defined(SHA256_ASM_NEON)
-  if (CRYPTO_is_NEON_capable()) {
-    sha256_block_data_order_neon(state, data, num);
-    return;
-  }
-#endif
   sha256_block_data_order_nohw(state, data, num);
 }
 
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
index d31ab71..0f4142c 100644
--- a/crypto/fipsmodule/sha/sha512.c
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -516,12 +516,6 @@
     return;
   }
 #endif
-#if defined(SHA512_ASM_NEON)
-  if (CRYPTO_is_NEON_capable()) {
-    sha512_block_data_order_neon(state, data, num);
-    return;
-  }
-#endif
   sha512_block_data_order_nohw(state, data, num);
 }
 
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc
index 671c170..22856f8 100644
--- a/crypto/fipsmodule/sha/sha_test.cc
+++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -75,11 +75,6 @@
       return;
     }
 #endif
-#if defined(SHA1_ASM_NEON)
-    if (CRYPTO_is_NEON_capable()) {
-      CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
-    }
-#endif
 #if defined(SHA1_ASM_NOHW)
     CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
 #endif
@@ -112,11 +107,6 @@
       return;
     }
 #endif
-#if defined(SHA256_ASM_NEON)
-    if (CRYPTO_is_NEON_capable()) {
-      CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
-    }
-#endif
 #if defined(SHA256_ASM_NOHW)
     CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
 #endif
@@ -142,11 +132,6 @@
       CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
     }
 #endif
-#if defined(SHA512_ASM_NEON)
-    if (CRYPTO_is_NEON_capable()) {
-      CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
-    }
-#endif
 #if defined(SHA512_ASM_NOHW)
     CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
 #endif