sha: Move AArch64/X86-64 dispatching to C.

Take a step towards removing all dispatching logic from assembly
for AArch64 and X86-64.

Change-Id: I1c965012e81837ff228c810d54e730c525cad54f
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64208
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv8.pl b/crypto/fipsmodule/sha/asm/sha1-armv8.pl
index 0a7a2fc..2eccfb7 100644
--- a/crypto/fipsmodule/sha/asm/sha1-armv8.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-armv8.pl
@@ -175,22 +175,12 @@
 
 .text
 
-.extern	OPENSSL_armcap_P
-.hidden OPENSSL_armcap_P
-.globl	sha1_block_data_order
-.type	sha1_block_data_order,%function
+.globl	sha1_block_data_order_nohw
+.type	sha1_block_data_order_nohw,%function
 .align	6
-sha1_block_data_order:
+sha1_block_data_order_nohw:
 	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
 	AARCH64_VALID_CALL_TARGET
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
-	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
-#else
-	adrp	x16,:pg_hi21:OPENSSL_armcap_P
-#endif
-	ldr	w16,[x16,:lo12:OPENSSL_armcap_P]
-	tst	w16,#ARMV8_SHA1
-	b.ne	.Lv8_entry
 
 	stp	x29,x30,[sp,#-96]!
 	add	x29,sp,#0
@@ -239,7 +229,7 @@
 	ldp	x27,x28,[sp,#80]
 	ldr	x29,[sp],#96
 	ret
-.size	sha1_block_data_order,.-sha1_block_data_order
+.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
 ___
 {{{
 my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
@@ -249,12 +239,12 @@
 my $ABCD_SAVE="v22.16b";
 
 $code.=<<___;
-.type	sha1_block_armv8,%function
+.globl	sha1_block_data_order_hw
+.type	sha1_block_data_order_hw,%function
 .align	6
-sha1_block_armv8:
+sha1_block_data_order_hw:
 	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
 	AARCH64_VALID_CALL_TARGET
-.Lv8_entry:
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 
@@ -318,7 +308,7 @@
 
 	ldr	x29,[sp],#16
 	ret
-.size	sha1_block_armv8,.-sha1_block_armv8
+.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
 .section .rodata
 .align	6
 .Lconst:
diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
index c2c8921..886f5cf 100755
--- a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -237,42 +237,13 @@
 
 $code.=<<___;
 .text
-.extern	OPENSSL_ia32cap_P
 
-.globl	sha1_block_data_order
-.type	sha1_block_data_order,\@function,3
+.globl	sha1_block_data_order_nohw
+.type	sha1_block_data_order_nohw,\@function,3
 .align	16
-sha1_block_data_order:
+sha1_block_data_order_nohw:
 .cfi_startproc
 	_CET_ENDBR
-	leaq	OPENSSL_ia32cap_P(%rip),%r10
-	mov	0(%r10),%r9d
-	mov	4(%r10),%r8d
-	mov	8(%r10),%r10d
-	test	\$`1<<9`,%r8d		# check SSSE3 bit
-	jz	.Lialu
-___
-$code.=<<___ if ($shaext);
-	test	\$`1<<29`,%r10d		# check SHA bit
-	jnz	_shaext_shortcut
-___
-$code.=<<___ if ($avx>1);
-	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
-	cmp	\$`1<<3|1<<5|1<<8`,%r10d
-	je	_avx2_shortcut
-___
-$code.=<<___ if ($avx);
-	and	\$`1<<28`,%r8d		# mask AVX bit
-	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
-	or	%r9d,%r8d
-	cmp	\$`1<<28|1<<30`,%r8d
-	je	_avx_shortcut
-___
-$code.=<<___;
-	jmp	_ssse3_shortcut
-
-.align	16
-.Lialu:
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
 	push	%rbx
@@ -341,7 +312,7 @@
 .Lepilogue:
 	ret
 .cfi_endproc
-.size	sha1_block_data_order,.-sha1_block_data_order
+.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
 ___
 if ($shaext) {{{
 ######################################################################
@@ -352,11 +323,12 @@
 my @MSG=map("%xmm$_",(4..7));
 
 $code.=<<___;
-.type	sha1_block_data_order_shaext,\@function,3
+.globl	sha1_block_data_order_hw
+.type	sha1_block_data_order_hw,\@function,3
 .align	32
-sha1_block_data_order_shaext:
-_shaext_shortcut:
+sha1_block_data_order_hw:
 .cfi_startproc
+	_CET_ENDBR
 ___
 $code.=<<___ if ($win64);
 	lea	`-8-4*16`(%rsp),%rsp
@@ -457,7 +429,7 @@
 $code.=<<___;
 	ret
 .cfi_endproc
-.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
 ___
 }}}
 {{{
@@ -487,11 +459,12 @@
 }
 
 $code.=<<___;
+.globl	sha1_block_data_order_ssse3
 .type	sha1_block_data_order_ssse3,\@function,3
 .align	16
 sha1_block_data_order_ssse3:
-_ssse3_shortcut:
 .cfi_startproc
+	_CET_ENDBR
 	mov	%rsp,$fp	# frame pointer
 .cfi_def_cfa_register	$fp
 	push	%rbx
@@ -961,11 +934,12 @@
 my $_ror=sub { &shrd(@_[0],@_) };
 
 $code.=<<___;
+.globl	sha1_block_data_order_avx
 .type	sha1_block_data_order_avx,\@function,3
 .align	16
 sha1_block_data_order_avx:
-_avx_shortcut:
 .cfi_startproc
+	_CET_ENDBR
 	mov	%rsp,$fp
 .cfi_def_cfa_register	$fp
 	push	%rbx
@@ -1340,11 +1314,12 @@
 my $frame="%r13";
 
 $code.=<<___;
+.globl	sha1_block_data_order_avx2
 .type	sha1_block_data_order_avx2,\@function,3
 .align	16
 sha1_block_data_order_avx2:
-_avx2_shortcut:
 .cfi_startproc
+	_CET_ENDBR
 	mov	%rsp,$fp
 .cfi_def_cfa_register	$fp
 	push	%rbx
@@ -2019,14 +1994,14 @@
 
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_sha1_block_data_order
-	.rva	.LSEH_end_sha1_block_data_order
-	.rva	.LSEH_info_sha1_block_data_order
+	.rva	.LSEH_begin_sha1_block_data_order_nohw
+	.rva	.LSEH_end_sha1_block_data_order_nohw
+	.rva	.LSEH_info_sha1_block_data_order_nohw
 ___
 $code.=<<___ if ($shaext);
-	.rva	.LSEH_begin_sha1_block_data_order_shaext
-	.rva	.LSEH_end_sha1_block_data_order_shaext
-	.rva	.LSEH_info_sha1_block_data_order_shaext
+	.rva	.LSEH_begin_sha1_block_data_order_hw
+	.rva	.LSEH_end_sha1_block_data_order_hw
+	.rva	.LSEH_info_sha1_block_data_order_hw
 ___
 $code.=<<___;
 	.rva	.LSEH_begin_sha1_block_data_order_ssse3
@@ -2046,12 +2021,12 @@
 $code.=<<___;
 .section	.xdata
 .align	8
-.LSEH_info_sha1_block_data_order:
+.LSEH_info_sha1_block_data_order_nohw:
 	.byte	9,0,0,0
 	.rva	se_handler
 ___
 $code.=<<___ if ($shaext);
-.LSEH_info_sha1_block_data_order_shaext:
+.LSEH_info_sha1_block_data_order_hw:
 	.byte	9,0,0,0
 	.rva	shaext_handler
 ___
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/crypto/fipsmodule/sha/asm/sha512-armv8.pl
index c7d9154..0ec0f51 100644
--- a/crypto/fipsmodule/sha/asm/sha512-armv8.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-armv8.pl
@@ -76,7 +76,7 @@
     *STDOUT=*OUT;
 }
 
-$func="sha${BITS}_block_data_order";
+$func="sha${BITS}_block_data_order_nohw";
 
 ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
 
@@ -180,31 +180,10 @@
 
 .text
 
-.extern	OPENSSL_armcap_P
-.hidden	OPENSSL_armcap_P
 .globl	$func
 .type	$func,%function
 .align	6
 $func:
-	AARCH64_VALID_CALL_TARGET
-#ifndef	__KERNEL__
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
-	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
-#else
-	adrp	x16,:pg_hi21:OPENSSL_armcap_P
-#endif
-	ldr	w16,[x16,:lo12:OPENSSL_armcap_P]
-___
-$code.=<<___	if ($SZ==4);
-	tst	w16,#ARMV8_SHA256
-	b.ne	.Lv8_entry
-___
-$code.=<<___	if ($SZ==8);
-	tst	w16,#ARMV8_SHA512
-	b.ne	.Lv8_entry
-___
-$code.=<<___;
-#endif
 	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-128]!
 	add	x29,sp,#0
@@ -356,11 +335,12 @@
 $code.=<<___;
 .text
 #ifndef	__KERNEL__
-.type	sha256_block_armv8,%function
+.globl	sha256_block_data_order_hw
+.type	sha256_block_data_order_hw,%function
 .align	6
-sha256_block_armv8:
-.Lv8_entry:
+sha256_block_data_order_hw:
 	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
 	stp		x29,x30,[sp,#-16]!
 	add		x29,sp,#0
 
@@ -425,7 +405,7 @@
 
 	ldr		x29,[sp],#16
 	ret
-.size	sha256_block_armv8,.-sha256_block_armv8
+.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
 #endif
 ___
 }
@@ -442,10 +422,12 @@
 $code.=<<___;
 .text
 #ifndef	__KERNEL__
-.type	sha512_block_armv8,%function
+.globl	sha512_block_data_order_hw
+.type	sha512_block_data_order_hw,%function
 .align	6
-sha512_block_armv8:
-.Lv8_entry:
+sha512_block_data_order_hw:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
 	stp		x29,x30,[sp,#-16]!
 	add		x29,sp,#0
 
@@ -529,7 +511,7 @@
 
 	ldr		x29,[sp],#16
 	ret
-.size	sha512_block_armv8,.-sha512_block_armv8
+.size	sha512_block_data_order_hw,.-sha512_block_data_order_hw
 #endif
 ___
 }
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index a3e4122..45bb81c 100755
--- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -257,42 +257,12 @@
 $code=<<___;
 .text
 
-.extern	OPENSSL_ia32cap_P
-.globl	$func
-.type	$func,\@function,3
+.globl	${func}_nohw
+.type	${func}_nohw,\@function,3
 .align	16
-$func:
+${func}_nohw:
 .cfi_startproc
 	_CET_ENDBR
-___
-$code.=<<___ if ($SZ==4 || $avx);
-	leaq	OPENSSL_ia32cap_P(%rip),%r11
-	mov	0(%r11),%r9d
-	mov	4(%r11),%r10d
-	mov	8(%r11),%r11d
-___
-$code.=<<___ if ($SZ==4 && $shaext);
-	test	\$`1<<29`,%r11d		# check for SHA
-	jnz	.Lshaext_shortcut
-___
-    # XOP codepath removed.
-$code.=<<___ if ($avx>1);
-	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
-	cmp	\$`1<<8|1<<5|1<<3`,%r11d
-	je	.Lavx2_shortcut
-___
-$code.=<<___ if ($avx);
-	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
-	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
-	or	%r9d,%r10d
-	cmp	\$`1<<28|1<<9|1<<30`,%r10d
-	je	.Lavx_shortcut
-___
-$code.=<<___ if ($SZ==4);
-	test	\$`1<<9`,%r10d
-	jnz	.Lssse3_shortcut
-___
-$code.=<<___;
 	mov	%rsp,%rax		# copy %rsp
 .cfi_def_cfa_register	%rax
 	push	%rbx
@@ -400,7 +370,7 @@
 .Lepilogue:
 	ret
 .cfi_endproc
-.size	$func,.-$func
+.size	${func}_nohw,.-${func}_nohw
 ___
 
 if ($SZ==4) {
@@ -558,11 +528,12 @@
 my @MSG=map("%xmm$_",(3..6));
 
 $code.=<<___;
-.type	sha256_block_data_order_shaext,\@function,3
+.globl	sha256_block_data_order_hw
+.type	sha256_block_data_order_hw,\@function,3
 .align	64
-sha256_block_data_order_shaext:
+sha256_block_data_order_hw:
 .cfi_startproc
-.Lshaext_shortcut:
+	_CET_ENDBR
 ___
 $code.=<<___ if ($win64);
 	lea	`-8-5*16`(%rsp),%rsp
@@ -707,7 +678,7 @@
 $code.=<<___;
 	ret
 .cfi_endproc
-.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
+.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
 ___
 }}}
 {{{
@@ -772,11 +743,12 @@
 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
 
 $code.=<<___;
+.globl	${func}_ssse3
 .type	${func}_ssse3,\@function,3
 .align	64
 ${func}_ssse3:
 .cfi_startproc
-.Lssse3_shortcut:
+	_CET_ENDBR
 	mov	%rsp,%rax		# copy %rsp
 .cfi_def_cfa_register	%rax
 	push	%rbx
@@ -1135,11 +1107,12 @@
 local *ror = sub { &shrd(@_[0],@_) };
 
 $code.=<<___;
+.globl	${func}_avx
 .type	${func}_avx,\@function,3
 .align	64
 ${func}_avx:
 .cfi_startproc
-.Lavx_shortcut:
+	_CET_ENDBR
 	mov	%rsp,%rax		# copy %rsp
 .cfi_def_cfa_register	%rax
 	push	%rbx
@@ -2005,14 +1978,14 @@
 $code.=<<___;
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_$func
-	.rva	.LSEH_end_$func
-	.rva	.LSEH_info_$func
+	.rva	.LSEH_begin_${func}_nohw
+	.rva	.LSEH_end_${func}_nohw
+	.rva	.LSEH_info_${func}_nohw
 ___
 $code.=<<___ if ($SZ==4 && $shaext);
-	.rva	.LSEH_begin_${func}_shaext
-	.rva	.LSEH_end_${func}_shaext
-	.rva	.LSEH_info_${func}_shaext
+	.rva	.LSEH_begin_${func}_hw
+	.rva	.LSEH_end_${func}_hw
+	.rva	.LSEH_info_${func}_hw
 ___
 $code.=<<___ if ($SZ==4);
 	.rva	.LSEH_begin_${func}_ssse3
@@ -2032,13 +2005,13 @@
 $code.=<<___;
 .section	.xdata
 .align	8
-.LSEH_info_$func:
+.LSEH_info_${func}_nohw:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lprologue,.Lepilogue			# HandlerData[]
 ___
 $code.=<<___ if ($SZ==4 && $shaext);
-.LSEH_info_${func}_shaext:
+.LSEH_info_${func}_hw:
 	.byte	9,0,0,0
 	.rva	shaext_handler
 ___
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 605f166..7dbab6b 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -17,25 +17,156 @@
 
 #include <openssl/base.h>
 
+#include "../../internal.h"
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
 
+// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
+// defined in assembly.
 
-#if !defined(OPENSSL_NO_ASM) &&                         \
-    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
-     defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
+#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
+
 #define SHA1_ASM
 #define SHA256_ASM
 #define SHA512_ASM
-void sha1_block_data_order(uint32_t *state, const uint8_t *in,
+
+void sha1_block_data_order(uint32_t *state, const uint8_t *data,
                            size_t num_blocks);
-void sha256_block_data_order(uint32_t *state, const uint8_t *in,
+void sha256_block_data_order(uint32_t *state, const uint8_t *data,
                              size_t num_blocks);
-void sha512_block_data_order(uint64_t *state, const uint8_t *in,
+void sha512_block_data_order(uint64_t *state, const uint8_t *data,
                              size_t num_blocks);
+
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
+
+#define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
+#define SHA512_ASM_NOHW
+
+#define SHA1_ASM_HW
+OPENSSL_INLINE int sha1_hw_capable(void) {
+  return CRYPTO_is_ARMv8_SHA1_capable();
+}
+
+#define SHA256_ASM_HW
+OPENSSL_INLINE int sha256_hw_capable(void) {
+  return CRYPTO_is_ARMv8_SHA256_capable();
+}
+
+#define SHA512_ASM_HW
+OPENSSL_INLINE int sha512_hw_capable(void) {
+  return CRYPTO_is_ARMv8_SHA512_capable();
+}
+
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
+
+#define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
+#define SHA512_ASM_NOHW
+
+#define SHA1_ASM_HW
+OPENSSL_INLINE int sha1_hw_capable(void) {
+  return CRYPTO_is_x86_SHA_capable() && CRYPTO_is_SSSE3_capable();
+}
+
+#define SHA1_ASM_AVX2
+OPENSSL_INLINE int sha1_avx2_capable(void) {
+  // TODO: Simplify this logic, which was extracted from the assembly:
+  //  * Does AVX2 imply SSSE3?
+  //  * sha1_block_data_order_avx2 does not seem to use SSSE3 instructions.
+  return CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable() &&
+         CRYPTO_is_BMI1_capable() && CRYPTO_is_SSSE3_capable();
+}
+void sha1_block_data_order_avx2(uint32_t *state, const uint8_t *data,
+                                size_t num);
+
+#define SHA1_ASM_AVX
+OPENSSL_INLINE int sha1_avx_capable(void) {
+  // TODO: Simplify this logic, which was extracted from the assembly:
+  //  * Does AVX imply SSSE3?
+  //  * sha1_block_data_order_avx does not seem to use SSSE3 instructions.
+  // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+  // discussion in sha1-586.pl.
+  return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
+         CRYPTO_is_intel_cpu();
+}
+void sha1_block_data_order_avx(uint32_t *state, const uint8_t *data,
+                               size_t num);
+
+#define SHA1_ASM_SSSE3
+OPENSSL_INLINE int sha1_ssse3_capable(void) {
+  return CRYPTO_is_SSSE3_capable();
+}
+void sha1_block_data_order_ssse3(uint32_t *state, const uint8_t *data,
+                                 size_t num);
+
+#define SHA256_ASM_HW
+OPENSSL_INLINE int sha256_hw_capable(void) {
+  return CRYPTO_is_x86_SHA_capable();
+}
+
+#define SHA256_ASM_AVX
+OPENSSL_INLINE int sha256_avx_capable(void) {
+  // TODO: Simplify this logic, which was extracted from the assembly:
+  //  * Does AVX imply SSSE3?
+  //  * sha256_block_data_order_avx does not seem to use SSSE3 instructions.
+  // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+  // discussion in sha1-586.pl.
+  return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
+         CRYPTO_is_intel_cpu();
+}
+void sha256_block_data_order_avx(uint32_t *state, const uint8_t *data,
+                                 size_t num);
+
+#define SHA256_ASM_SSSE3
+OPENSSL_INLINE int sha256_ssse3_capable(void) {
+  return CRYPTO_is_SSSE3_capable();
+}
+void sha256_block_data_order_ssse3(uint32_t *state, const uint8_t *data,
+                                   size_t num);
+
+#define SHA512_ASM_AVX
+OPENSSL_INLINE int sha512_avx_capable(void) {
+  // TODO: Simplify this logic, which was extracted from the assembly:
+  //  * Does AVX imply SSSE3?
+  //  * sha512_block_data_order_avx does not seem to use SSSE3 instructions.
+  // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+  // discussion in sha1-586.pl.
+  return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
+         CRYPTO_is_intel_cpu();
+}
+void sha512_block_data_order_avx(uint64_t *state, const uint8_t *data,
+                                 size_t num);
+
 #endif
 
+#if defined(SHA1_ASM_HW)
+void sha1_block_data_order_hw(uint32_t *state, const uint8_t *data, size_t num);
+#endif
+#if defined(SHA1_ASM_NOHW)
+void sha1_block_data_order_nohw(uint32_t *state, const uint8_t *data,
+                                size_t num);
+#endif
+
+#if defined(SHA256_ASM_HW)
+void sha256_block_data_order_hw(uint32_t *state, const uint8_t *data,
+                                size_t num);
+#endif
+#if defined(SHA256_ASM_NOHW)
+void sha256_block_data_order_nohw(uint32_t *state, const uint8_t *data,
+                                  size_t num);
+#endif
+
+#if defined(SHA512_ASM_HW)
+void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
+                                size_t num);
+#endif
+#if defined(SHA512_ASM_NOHW)
+void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
+                                  size_t num);
+#endif
 
 #if defined(__cplusplus)
 }  // extern "C"
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index 4baeed6..7b267e3 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -232,8 +232,10 @@
 #define X(i)  XX##i
 
 #if !defined(SHA1_ASM)
-static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
-                                  size_t num) {
+
+#if !defined(SHA1_ASM_NOHW)
+static void sha1_block_data_order_nohw(uint32_t *state, const uint8_t *data,
+                                       size_t num) {
   register uint32_t A, B, C, D, E, T;
   uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10,
       XX11, XX12, XX13, XX14, XX15;
@@ -380,7 +382,38 @@
     E = state[4];
   }
 }
+#endif  // !SHA1_ASM_NOHW
+
+static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
+                                  size_t num) {
+#if defined(SHA1_ASM_HW)
+  if (sha1_hw_capable()) {
+    sha1_block_data_order_hw(state, data, num);
+    return;
+  }
 #endif
+#if defined(SHA1_ASM_AVX2)
+  if (sha1_avx2_capable()) {
+    sha1_block_data_order_avx2(state, data, num);
+    return;
+  }
+#endif
+#if defined(SHA1_ASM_AVX)
+  if (sha1_avx_capable()) {
+    sha1_block_data_order_avx(state, data, num);
+    return;
+  }
+#endif
+#if defined(SHA1_ASM_SSSE3)
+  if (sha1_ssse3_capable()) {
+    sha1_block_data_order_ssse3(state, data, num);
+    return;
+  }
+#endif
+  sha1_block_data_order_nohw(state, data, num);
+}
+
+#endif  // !SHA1_ASM
 
 #undef Xupdate
 #undef K_00_19
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
index 046f6e2..0b0aca2 100644
--- a/crypto/fipsmodule/sha/sha256.c
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -114,7 +114,7 @@
   return out;
 }
 
-#ifndef SHA256_ASM
+#if !defined(SHA256_ASM)
 static void sha256_block_data_order(uint32_t *state, const uint8_t *in,
                                     size_t num);
 #endif
@@ -172,7 +172,9 @@
   return sha256_final_impl(out, SHA224_DIGEST_LENGTH, ctx);
 }
 
-#ifndef SHA256_ASM
+#if !defined(SHA256_ASM)
+
+#if !defined(SHA256_ASM_NOHW)
 static const uint32_t K256[64] = {
     0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL,
     0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL,
@@ -221,8 +223,8 @@
     ROUND_00_15(i, a, b, c, d, e, f, g, h);            \
   } while (0)
 
-static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
-                                    size_t num) {
+static void sha256_block_data_order_nohw(uint32_t *state, const uint8_t *data,
+                                         size_t num) {
   uint32_t a, b, c, d, e, f, g, h, s0, s1, T1;
   uint32_t X[16];
   int i;
@@ -308,7 +310,33 @@
   }
 }
 
-#endif  // !SHA256_ASM
+#endif  // !defined(SHA256_ASM_NOHW)
+
+static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
+                                    size_t num) {
+#if defined(SHA256_ASM_HW)
+  if (sha256_hw_capable()) {
+    sha256_block_data_order_hw(state, data, num);
+    return;
+  }
+#endif
+#if defined(SHA256_ASM_AVX)
+  if (sha256_avx_capable()) {
+    sha256_block_data_order_avx(state, data, num);
+    return;
+  }
+#endif
+#if defined(SHA256_ASM_SSSE3)
+  if (sha256_ssse3_capable()) {
+    sha256_block_data_order_ssse3(state, data, num);
+    return;
+  }
+#endif
+  sha256_block_data_order_nohw(state, data, num);
+}
+
+#endif  // !defined(SHA256_ASM)
+
 
 void SHA256_TransformBlocks(uint32_t state[8], const uint8_t *data,
                             size_t num_blocks) {
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
index 2c7ce31..0f4142c 100644
--- a/crypto/fipsmodule/sha/sha512.c
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -279,7 +279,9 @@
   return 1;
 }
 
-#ifndef SHA512_ASM
+#if !defined(SHA512_ASM)
+
+#if !defined(SHA512_ASM_NOHW)
 static const uint64_t K512[80] = {
     UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd),
     UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc),
@@ -341,8 +343,8 @@
 #if defined(__i386) || defined(__i386__) || defined(_M_IX86)
 // This code should give better results on 32-bit CPU with less than
 // ~24 registers, both size and performance wise...
-static void sha512_block_data_order(uint64_t *state, const uint8_t *in,
-                                    size_t num) {
+static void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *in,
+                                         size_t num) {
   uint64_t A, E, T;
   uint64_t X[9 + 80], *F;
   int i;
@@ -414,8 +416,8 @@
     ROUND_00_15(i + j, a, b, c, d, e, f, g, h);        \
   } while (0)
 
-static void sha512_block_data_order(uint64_t *state, const uint8_t *in,
-                                    size_t num) {
+static void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *in,
+                                         size_t num) {
   uint64_t a, b, c, d, e, f, g, h, s0, s1, T1;
   uint64_t X[16];
   int i;
@@ -498,6 +500,25 @@
 
 #endif
 
+#endif  // !SHA512_ASM_NOHW
+
+static void sha512_block_data_order(uint64_t *state, const uint8_t *data,
+                                    size_t num) {
+#if defined(SHA512_ASM_HW)
+  if (sha512_hw_capable()) {
+    sha512_block_data_order_hw(state, data, num);
+    return;
+  }
+#endif
+#if defined(SHA512_ASM_AVX)
+  if (sha512_avx_capable()) {
+    sha512_block_data_order_avx(state, data, num);
+    return;
+  }
+#endif
+  sha512_block_data_order_nohw(state, data, num);
+}
+
 #endif  // !SHA512_ASM
 
 #undef Sigma0
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc
index 5029bb0..22856f8 100644
--- a/crypto/fipsmodule/sha/sha_test.cc
+++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -42,41 +42,96 @@
  }
 }
 
-#if defined(SHA1_ASM) && defined(SUPPORTS_ABI_TEST)
+#if defined(SUPPORTS_ABI_TEST)
+
 TEST(SHATest, SHA1ABI) {
   SHA_CTX ctx;
   SHA1_Init(&ctx);
 
   static const uint8_t kBuf[SHA_CBLOCK * 8] = {0};
-  CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 1);
-  CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 2);
-  CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 4);
-  CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 8);
+  for (size_t blocks : {1, 2, 4, 8}) {
+#if defined(SHA1_ASM)
+    CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, blocks);
+#endif
+#if defined(SHA1_ASM_HW)
+    if (sha1_hw_capable()) {
+      CHECK_ABI(sha1_block_data_order_hw, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA1_ASM_AVX2)
+    if (sha1_avx2_capable()) {
+      CHECK_ABI(sha1_block_data_order_avx2, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA1_ASM_AVX)
+    if (sha1_avx_capable()) {
+      CHECK_ABI(sha1_block_data_order_avx, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA1_ASM_SSSE3)
+    if (sha1_ssse3_capable()) {
+      CHECK_ABI(sha1_block_data_order_ssse3, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA1_ASM_NOHW)
+    CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
+#endif
+  }
 }
-#endif  // SHA1_ASM && SUPPORTS_ABI_TEST
 
-#if defined(SHA256_ASM) && defined(SUPPORTS_ABI_TEST)
 TEST(SHATest, SHA256ABI) {
   SHA256_CTX ctx;
   SHA256_Init(&ctx);
 
   static const uint8_t kBuf[SHA256_CBLOCK * 8] = {0};
-  CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 1);
-  CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 2);
-  CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 4);
-  CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 8);
+  for (size_t blocks : {1, 2, 4, 8}) {
+#if defined(SHA256_ASM)
+    CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, blocks);
+#endif
+#if defined(SHA256_ASM_HW)
+    if (sha256_hw_capable()) {
+      CHECK_ABI(sha256_block_data_order_hw, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA256_ASM_AVX)
+    if (sha256_avx_capable()) {
+      CHECK_ABI(sha256_block_data_order_avx, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA256_ASM_SSSE3)
+    if (sha256_ssse3_capable()) {
+      CHECK_ABI(sha256_block_data_order_ssse3, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA256_ASM_NOHW)
+    CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
+#endif
+  }
 }
-#endif  // SHA256_ASM && SUPPORTS_ABI_TEST
 
-#if defined(SHA512_ASM) && defined(SUPPORTS_ABI_TEST)
 TEST(SHATest, SHA512ABI) {
   SHA512_CTX ctx;
   SHA512_Init(&ctx);
 
   static const uint8_t kBuf[SHA512_CBLOCK * 4] = {0};
-  CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 1);
-  CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 2);
-  CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 3);
-  CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 4);
+  for (size_t blocks : {1, 2, 3, 4}) {
+#if defined(SHA512_ASM)
+    CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, blocks);
+#endif
+#if defined(SHA512_ASM_HW)
+    if (sha512_hw_capable()) {
+      CHECK_ABI(sha512_block_data_order_hw, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA512_ASM_AVX)
+    if (sha512_avx_capable()) {
+      CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
+    }
+#endif
+#if defined(SHA512_ASM_NOHW)
+    CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
+#endif
+  }
 }
-#endif  // SHA512_ASM && SUPPORTS_ABI_TEST
+
+#endif  // SUPPORTS_ABI_TEST
diff --git a/crypto/internal.h b/crypto/internal.h
index f2db41c..e9da010 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1450,6 +1450,15 @@
 #endif
 }
 
+// SHA-1 and SHA-256 are defined as a single extension.
+OPENSSL_INLINE int CRYPTO_is_x86_SHA_capable(void) {
+#if defined(__SHA__)
+  return 1;
+#else
+  return (OPENSSL_get_ia32cap(2) & (1u << 29)) != 0;
+#endif
+}
+
 #endif  // OPENSSL_X86 || OPENSSL_X86_64
 
 #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
@@ -1519,6 +1528,41 @@
 #endif
 }
 
+OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA1_capable(void) {
+  // SHA-1 and SHA-2 (only) share |__ARM_FEATURE_SHA2| but otherwise
+  // are dealt with independently.
+#if defined(OPENSSL_STATIC_ARMCAP_SHA1) || defined(__ARM_FEATURE_SHA2)
+  return 1;
+#elif defined(OPENSSL_STATIC_ARMCAP)
+  return 0;
+#else
+  return (OPENSSL_get_armcap() & ARMV8_SHA1) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA256_capable(void) {
+  // SHA-1 and SHA-2 (only) share |__ARM_FEATURE_SHA2| but otherwise
+  // are dealt with independently.
+#if defined(OPENSSL_STATIC_ARMCAP_SHA256) || defined(__ARM_FEATURE_SHA2)
+  return 1;
+#elif defined(OPENSSL_STATIC_ARMCAP)
+  return 0;
+#else
+  return (OPENSSL_get_armcap() & ARMV8_SHA256) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA512_capable(void) {
+  // There is no |OPENSSL_STATIC_ARMCAP_SHA512|.
+#if defined(__ARM_FEATURE_SHA512)
+  return 1;
+#elif defined(OPENSSL_STATIC_ARMCAP)
+  return 0;
+#else
+  return (OPENSSL_get_armcap() & ARMV8_SHA512) != 0;
+#endif
+}
+
 #endif  // OPENSSL_ARM || OPENSSL_AARCH64
 
 #if defined(BORINGSSL_DISPATCH_TEST)