sha: Move AArch64/X86-64 dispatching to C.
Take a step towards removing all dispatching logic from assembly
for AArch64 and X86-64.
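For reference, the per-target dispatch now happens in C rather than in the
assembly prologues. A minimal sketch of the pattern, mirroring
sha1_block_data_order in crypto/fipsmodule/sha/sha1.c below (the AVX2,
AVX, and SSSE3 tiers are elided here):

  static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
                                    size_t num) {
  #if defined(SHA1_ASM_HW)
    if (sha1_hw_capable()) {
      sha1_block_data_order_hw(state, data, num);
      return;
    }
  #endif
    // ... remaining SHA1_ASM_* tiers checked here ...
    sha1_block_data_order_nohw(state, data, num);
  }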
Change-Id: I1c965012e81837ff228c810d54e730c525cad54f
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64208
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-armv8.pl b/crypto/fipsmodule/sha/asm/sha1-armv8.pl
index 0a7a2fc..2eccfb7 100644
--- a/crypto/fipsmodule/sha/asm/sha1-armv8.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-armv8.pl
@@ -175,22 +175,12 @@
.text
-.extern OPENSSL_armcap_P
-.hidden OPENSSL_armcap_P
-.globl sha1_block_data_order
-.type sha1_block_data_order,%function
+.globl sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,%function
.align 6
-sha1_block_data_order:
+sha1_block_data_order_nohw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
- adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
-#else
- adrp x16,:pg_hi21:OPENSSL_armcap_P
-#endif
- ldr w16,[x16,:lo12:OPENSSL_armcap_P]
- tst w16,#ARMV8_SHA1
- b.ne .Lv8_entry
stp x29,x30,[sp,#-96]!
add x29,sp,#0
@@ -239,7 +229,7 @@
ldp x27,x28,[sp,#80]
ldr x29,[sp],#96
ret
-.size sha1_block_data_order,.-sha1_block_data_order
+.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
___
{{{
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
@@ -249,12 +239,12 @@
my $ABCD_SAVE="v22.16b";
$code.=<<___;
-.type sha1_block_armv8,%function
+.globl sha1_block_data_order_hw
+.type sha1_block_data_order_hw,%function
.align 6
-sha1_block_armv8:
+sha1_block_data_order_hw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
-.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
@@ -318,7 +308,7 @@
ldr x29,[sp],#16
ret
-.size sha1_block_armv8,.-sha1_block_armv8
+.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
.section .rodata
.align 6
.Lconst:
diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
index c2c8921..886f5cf 100755
--- a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -237,42 +237,13 @@
$code.=<<___;
.text
-.extern OPENSSL_ia32cap_P
-.globl sha1_block_data_order
-.type sha1_block_data_order,\@function,3
+.globl sha1_block_data_order_nohw
+.type sha1_block_data_order_nohw,\@function,3
.align 16
-sha1_block_data_order:
+sha1_block_data_order_nohw:
.cfi_startproc
_CET_ENDBR
- leaq OPENSSL_ia32cap_P(%rip),%r10
- mov 0(%r10),%r9d
- mov 4(%r10),%r8d
- mov 8(%r10),%r10d
- test \$`1<<9`,%r8d # check SSSE3 bit
- jz .Lialu
-___
-$code.=<<___ if ($shaext);
- test \$`1<<29`,%r10d # check SHA bit
- jnz _shaext_shortcut
-___
-$code.=<<___ if ($avx>1);
- and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2
- cmp \$`1<<3|1<<5|1<<8`,%r10d
- je _avx2_shortcut
-___
-$code.=<<___ if ($avx);
- and \$`1<<28`,%r8d # mask AVX bit
- and \$`1<<30`,%r9d # mask "Intel CPU" bit
- or %r9d,%r8d
- cmp \$`1<<28|1<<30`,%r8d
- je _avx_shortcut
-___
-$code.=<<___;
- jmp _ssse3_shortcut
-
-.align 16
-.Lialu:
mov %rsp,%rax
.cfi_def_cfa_register %rax
push %rbx
@@ -341,7 +312,7 @@
.Lepilogue:
ret
.cfi_endproc
-.size sha1_block_data_order,.-sha1_block_data_order
+.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
___
if ($shaext) {{{
######################################################################
@@ -352,11 +323,12 @@
my @MSG=map("%xmm$_",(4..7));
$code.=<<___;
-.type sha1_block_data_order_shaext,\@function,3
+.globl sha1_block_data_order_hw
+.type sha1_block_data_order_hw,\@function,3
.align 32
-sha1_block_data_order_shaext:
-_shaext_shortcut:
+sha1_block_data_order_hw:
.cfi_startproc
+ _CET_ENDBR
___
$code.=<<___ if ($win64);
lea `-8-4*16`(%rsp),%rsp
@@ -457,7 +429,7 @@
$code.=<<___;
ret
.cfi_endproc
-.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
___
}}}
{{{
@@ -487,11 +459,12 @@
}
$code.=<<___;
+.globl sha1_block_data_order_ssse3
.type sha1_block_data_order_ssse3,\@function,3
.align 16
sha1_block_data_order_ssse3:
-_ssse3_shortcut:
.cfi_startproc
+ _CET_ENDBR
mov %rsp,$fp # frame pointer
.cfi_def_cfa_register $fp
push %rbx
@@ -961,11 +934,12 @@
my $_ror=sub { &shrd(@_[0],@_) };
$code.=<<___;
+.globl sha1_block_data_order_avx
.type sha1_block_data_order_avx,\@function,3
.align 16
sha1_block_data_order_avx:
-_avx_shortcut:
.cfi_startproc
+ _CET_ENDBR
mov %rsp,$fp
.cfi_def_cfa_register $fp
push %rbx
@@ -1340,11 +1314,12 @@
my $frame="%r13";
$code.=<<___;
+.globl sha1_block_data_order_avx2
.type sha1_block_data_order_avx2,\@function,3
.align 16
sha1_block_data_order_avx2:
-_avx2_shortcut:
.cfi_startproc
+ _CET_ENDBR
mov %rsp,$fp
.cfi_def_cfa_register $fp
push %rbx
@@ -2019,14 +1994,14 @@
.section .pdata
.align 4
- .rva .LSEH_begin_sha1_block_data_order
- .rva .LSEH_end_sha1_block_data_order
- .rva .LSEH_info_sha1_block_data_order
+ .rva .LSEH_begin_sha1_block_data_order_nohw
+ .rva .LSEH_end_sha1_block_data_order_nohw
+ .rva .LSEH_info_sha1_block_data_order_nohw
___
$code.=<<___ if ($shaext);
- .rva .LSEH_begin_sha1_block_data_order_shaext
- .rva .LSEH_end_sha1_block_data_order_shaext
- .rva .LSEH_info_sha1_block_data_order_shaext
+ .rva .LSEH_begin_sha1_block_data_order_hw
+ .rva .LSEH_end_sha1_block_data_order_hw
+ .rva .LSEH_info_sha1_block_data_order_hw
___
$code.=<<___;
.rva .LSEH_begin_sha1_block_data_order_ssse3
@@ -2046,12 +2021,12 @@
$code.=<<___;
.section .xdata
.align 8
-.LSEH_info_sha1_block_data_order:
+.LSEH_info_sha1_block_data_order_nohw:
.byte 9,0,0,0
.rva se_handler
___
$code.=<<___ if ($shaext);
-.LSEH_info_sha1_block_data_order_shaext:
+.LSEH_info_sha1_block_data_order_hw:
.byte 9,0,0,0
.rva shaext_handler
___
diff --git a/crypto/fipsmodule/sha/asm/sha512-armv8.pl b/crypto/fipsmodule/sha/asm/sha512-armv8.pl
index c7d9154..0ec0f51 100644
--- a/crypto/fipsmodule/sha/asm/sha512-armv8.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-armv8.pl
@@ -76,7 +76,7 @@
*STDOUT=*OUT;
}
-$func="sha${BITS}_block_data_order";
+$func="sha${BITS}_block_data_order_nohw";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@@ -180,31 +180,10 @@
.text
-.extern OPENSSL_armcap_P
-.hidden OPENSSL_armcap_P
.globl $func
.type $func,%function
.align 6
$func:
- AARCH64_VALID_CALL_TARGET
-#ifndef __KERNEL__
-#if defined(OPENSSL_HWASAN) && __clang_major__ >= 10
- adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
-#else
- adrp x16,:pg_hi21:OPENSSL_armcap_P
-#endif
- ldr w16,[x16,:lo12:OPENSSL_armcap_P]
-___
-$code.=<<___ if ($SZ==4);
- tst w16,#ARMV8_SHA256
- b.ne .Lv8_entry
-___
-$code.=<<___ if ($SZ==8);
- tst w16,#ARMV8_SHA512
- b.ne .Lv8_entry
-___
-$code.=<<___;
-#endif
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
@@ -356,11 +335,12 @@
$code.=<<___;
.text
#ifndef __KERNEL__
-.type sha256_block_armv8,%function
+.globl sha256_block_data_order_hw
+.type sha256_block_data_order_hw,%function
.align 6
-sha256_block_armv8:
-.Lv8_entry:
+sha256_block_data_order_hw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
@@ -425,7 +405,7 @@
ldr x29,[sp],#16
ret
-.size sha256_block_armv8,.-sha256_block_armv8
+.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}
@@ -442,10 +422,12 @@
$code.=<<___;
.text
#ifndef __KERNEL__
-.type sha512_block_armv8,%function
+.globl sha512_block_data_order_hw
+.type sha512_block_data_order_hw,%function
.align 6
-sha512_block_armv8:
-.Lv8_entry:
+sha512_block_data_order_hw:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
@@ -529,7 +511,7 @@
ldr x29,[sp],#16
ret
-.size sha512_block_armv8,.-sha512_block_armv8
+.size sha512_block_data_order_hw,.-sha512_block_data_order_hw
#endif
___
}
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index a3e4122..45bb81c 100755
--- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -257,42 +257,12 @@
$code=<<___;
.text
-.extern OPENSSL_ia32cap_P
-.globl $func
-.type $func,\@function,3
+.globl ${func}_nohw
+.type ${func}_nohw,\@function,3
.align 16
-$func:
+${func}_nohw:
.cfi_startproc
_CET_ENDBR
-___
-$code.=<<___ if ($SZ==4 || $avx);
- leaq OPENSSL_ia32cap_P(%rip),%r11
- mov 0(%r11),%r9d
- mov 4(%r11),%r10d
- mov 8(%r11),%r11d
-___
-$code.=<<___ if ($SZ==4 && $shaext);
- test \$`1<<29`,%r11d # check for SHA
- jnz .Lshaext_shortcut
-___
- # XOP codepath removed.
-$code.=<<___ if ($avx>1);
- and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
- cmp \$`1<<8|1<<5|1<<3`,%r11d
- je .Lavx2_shortcut
-___
-$code.=<<___ if ($avx);
- and \$`1<<30`,%r9d # mask "Intel CPU" bit
- and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
- or %r9d,%r10d
- cmp \$`1<<28|1<<9|1<<30`,%r10d
- je .Lavx_shortcut
-___
-$code.=<<___ if ($SZ==4);
- test \$`1<<9`,%r10d
- jnz .Lssse3_shortcut
-___
-$code.=<<___;
mov %rsp,%rax # copy %rsp
.cfi_def_cfa_register %rax
push %rbx
@@ -400,7 +370,7 @@
.Lepilogue:
ret
.cfi_endproc
-.size $func,.-$func
+.size ${func}_nohw,.-${func}_nohw
___
if ($SZ==4) {
@@ -558,11 +528,12 @@
my @MSG=map("%xmm$_",(3..6));
$code.=<<___;
-.type sha256_block_data_order_shaext,\@function,3
+.globl sha256_block_data_order_hw
+.type sha256_block_data_order_hw,\@function,3
.align 64
-sha256_block_data_order_shaext:
+sha256_block_data_order_hw:
.cfi_startproc
-.Lshaext_shortcut:
+ _CET_ENDBR
___
$code.=<<___ if ($win64);
lea `-8-5*16`(%rsp),%rsp
@@ -707,7 +678,7 @@
$code.=<<___;
ret
.cfi_endproc
-.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
+.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
___
}}}
{{{
@@ -772,11 +743,12 @@
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
$code.=<<___;
+.globl ${func}_ssse3
.type ${func}_ssse3,\@function,3
.align 64
${func}_ssse3:
.cfi_startproc
-.Lssse3_shortcut:
+ _CET_ENDBR
mov %rsp,%rax # copy %rsp
.cfi_def_cfa_register %rax
push %rbx
@@ -1135,11 +1107,12 @@
local *ror = sub { &shrd(@_[0],@_) };
$code.=<<___;
+.globl ${func}_avx
.type ${func}_avx,\@function,3
.align 64
${func}_avx:
.cfi_startproc
-.Lavx_shortcut:
+ _CET_ENDBR
mov %rsp,%rax # copy %rsp
.cfi_def_cfa_register %rax
push %rbx
@@ -2005,14 +1978,14 @@
$code.=<<___;
.section .pdata
.align 4
- .rva .LSEH_begin_$func
- .rva .LSEH_end_$func
- .rva .LSEH_info_$func
+ .rva .LSEH_begin_${func}_nohw
+ .rva .LSEH_end_${func}_nohw
+ .rva .LSEH_info_${func}_nohw
___
$code.=<<___ if ($SZ==4 && $shaext);
- .rva .LSEH_begin_${func}_shaext
- .rva .LSEH_end_${func}_shaext
- .rva .LSEH_info_${func}_shaext
+ .rva .LSEH_begin_${func}_hw
+ .rva .LSEH_end_${func}_hw
+ .rva .LSEH_info_${func}_hw
___
$code.=<<___ if ($SZ==4);
.rva .LSEH_begin_${func}_ssse3
@@ -2032,13 +2005,13 @@
$code.=<<___;
.section .xdata
.align 8
-.LSEH_info_$func:
+.LSEH_info_${func}_nohw:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue,.Lepilogue # HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
-.LSEH_info_${func}_shaext:
+.LSEH_info_${func}_hw:
.byte 9,0,0,0
.rva shaext_handler
___
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 605f166..7dbab6b 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -17,25 +17,156 @@
#include <openssl/base.h>
+#include "../../internal.h"
+
#if defined(__cplusplus)
extern "C" {
#endif
+// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
+// defined in assembly.
-#if !defined(OPENSSL_NO_ASM) && \
- (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
- defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
+#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_ARM))
+
#define SHA1_ASM
#define SHA256_ASM
#define SHA512_ASM
-void sha1_block_data_order(uint32_t *state, const uint8_t *in,
+
+void sha1_block_data_order(uint32_t *state, const uint8_t *data,
size_t num_blocks);
-void sha256_block_data_order(uint32_t *state, const uint8_t *in,
+void sha256_block_data_order(uint32_t *state, const uint8_t *data,
size_t num_blocks);
-void sha512_block_data_order(uint64_t *state, const uint8_t *in,
+void sha512_block_data_order(uint64_t *state, const uint8_t *data,
size_t num_blocks);
+
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
+
+#define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
+#define SHA512_ASM_NOHW
+
+#define SHA1_ASM_HW
+OPENSSL_INLINE int sha1_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA1_capable();
+}
+
+#define SHA256_ASM_HW
+OPENSSL_INLINE int sha256_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA256_capable();
+}
+
+#define SHA512_ASM_HW
+OPENSSL_INLINE int sha512_hw_capable(void) {
+ return CRYPTO_is_ARMv8_SHA512_capable();
+}
+
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
+
+#define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
+#define SHA512_ASM_NOHW
+
+#define SHA1_ASM_HW
+OPENSSL_INLINE int sha1_hw_capable(void) {
+ return CRYPTO_is_x86_SHA_capable() && CRYPTO_is_SSSE3_capable();
+}
+
+#define SHA1_ASM_AVX2
+OPENSSL_INLINE int sha1_avx2_capable(void) {
+ // TODO: Simplify this logic, which was extracted from the assembly:
+ // * Does AVX2 imply SSSE3?
+ // * sha1_block_data_order_avx2 does not seem to use SSSE3 instructions.
+ return CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable() &&
+ CRYPTO_is_BMI1_capable() && CRYPTO_is_SSSE3_capable();
+}
+void sha1_block_data_order_avx2(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+#define SHA1_ASM_AVX
+OPENSSL_INLINE int sha1_avx_capable(void) {
+ // TODO: Simplify this logic, which was extracted from the assembly:
+ // * Does AVX imply SSSE3?
+ // * sha1_block_data_order_avx does not seem to use SSSE3 instructions.
+ // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+ // discussion in sha1-586.pl.
+ return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
+ CRYPTO_is_intel_cpu();
+}
+void sha1_block_data_order_avx(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+#define SHA1_ASM_SSSE3
+OPENSSL_INLINE int sha1_ssse3_capable(void) {
+ return CRYPTO_is_SSSE3_capable();
+}
+void sha1_block_data_order_ssse3(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+#define SHA256_ASM_HW
+OPENSSL_INLINE int sha256_hw_capable(void) {
+ return CRYPTO_is_x86_SHA_capable();
+}
+
+#define SHA256_ASM_AVX
+OPENSSL_INLINE int sha256_avx_capable(void) {
+ // TODO: Simplify this logic, which was extracted from the assembly:
+ // * Does AVX imply SSSE3?
+ // * sha256_block_data_order_avx does not seem to use SSSE3 instructions.
+ // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+ // discussion in sha1-586.pl.
+ return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
+ CRYPTO_is_intel_cpu();
+}
+void sha256_block_data_order_avx(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+#define SHA256_ASM_SSSE3
+OPENSSL_INLINE int sha256_ssse3_capable(void) {
+ return CRYPTO_is_SSSE3_capable();
+}
+void sha256_block_data_order_ssse3(uint32_t *state, const uint8_t *data,
+ size_t num);
+
+#define SHA512_ASM_AVX
+OPENSSL_INLINE int sha512_avx_capable(void) {
+ // TODO: Simplify this logic, which was extracted from the assembly:
+ // * Does AVX imply SSSE3?
+ // * sha512_block_data_order_avx does not seem to use SSSE3 instructions.
+ // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+ // discussion in sha1-586.pl.
+ return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
+ CRYPTO_is_intel_cpu();
+}
+void sha512_block_data_order_avx(uint64_t *state, const uint8_t *data,
+ size_t num);
+
#endif
+#if defined(SHA1_ASM_HW)
+void sha1_block_data_order_hw(uint32_t *state, const uint8_t *data, size_t num);
+#endif
+#if defined(SHA1_ASM_NOHW)
+void sha1_block_data_order_nohw(uint32_t *state, const uint8_t *data,
+ size_t num);
+#endif
+
+#if defined(SHA256_ASM_HW)
+void sha256_block_data_order_hw(uint32_t *state, const uint8_t *data,
+ size_t num);
+#endif
+#if defined(SHA256_ASM_NOHW)
+void sha256_block_data_order_nohw(uint32_t *state, const uint8_t *data,
+ size_t num);
+#endif
+
+#if defined(SHA512_ASM_HW)
+void sha512_block_data_order_hw(uint64_t *state, const uint8_t *data,
+ size_t num);
+#endif
+#if defined(SHA512_ASM_NOHW)
+void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *data,
+ size_t num);
+#endif
#if defined(__cplusplus)
} // extern "C"
diff --git a/crypto/fipsmodule/sha/sha1.c b/crypto/fipsmodule/sha/sha1.c
index 4baeed6..7b267e3 100644
--- a/crypto/fipsmodule/sha/sha1.c
+++ b/crypto/fipsmodule/sha/sha1.c
@@ -232,8 +232,10 @@
#define X(i) XX##i
#if !defined(SHA1_ASM)
-static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
- size_t num) {
+
+#if !defined(SHA1_ASM_NOHW)
+static void sha1_block_data_order_nohw(uint32_t *state, const uint8_t *data,
+ size_t num) {
register uint32_t A, B, C, D, E, T;
uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10,
XX11, XX12, XX13, XX14, XX15;
@@ -380,7 +382,38 @@
E = state[4];
}
}
+#endif // !SHA1_ASM_NOHW
+
+static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
+ size_t num) {
+#if defined(SHA1_ASM_HW)
+ if (sha1_hw_capable()) {
+ sha1_block_data_order_hw(state, data, num);
+ return;
+ }
#endif
+#if defined(SHA1_ASM_AVX2)
+ if (sha1_avx2_capable()) {
+ sha1_block_data_order_avx2(state, data, num);
+ return;
+ }
+#endif
+#if defined(SHA1_ASM_AVX)
+ if (sha1_avx_capable()) {
+ sha1_block_data_order_avx(state, data, num);
+ return;
+ }
+#endif
+#if defined(SHA1_ASM_SSSE3)
+ if (sha1_ssse3_capable()) {
+ sha1_block_data_order_ssse3(state, data, num);
+ return;
+ }
+#endif
+ sha1_block_data_order_nohw(state, data, num);
+}
+
+#endif // !SHA1_ASM
#undef Xupdate
#undef K_00_19
diff --git a/crypto/fipsmodule/sha/sha256.c b/crypto/fipsmodule/sha/sha256.c
index 046f6e2..0b0aca2 100644
--- a/crypto/fipsmodule/sha/sha256.c
+++ b/crypto/fipsmodule/sha/sha256.c
@@ -114,7 +114,7 @@
return out;
}
-#ifndef SHA256_ASM
+#if !defined(SHA256_ASM)
static void sha256_block_data_order(uint32_t *state, const uint8_t *in,
size_t num);
#endif
@@ -172,7 +172,9 @@
return sha256_final_impl(out, SHA224_DIGEST_LENGTH, ctx);
}
-#ifndef SHA256_ASM
+#if !defined(SHA256_ASM)
+
+#if !defined(SHA256_ASM_NOHW)
static const uint32_t K256[64] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL,
0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL,
@@ -221,8 +223,8 @@
ROUND_00_15(i, a, b, c, d, e, f, g, h); \
} while (0)
-static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
- size_t num) {
+static void sha256_block_data_order_nohw(uint32_t *state, const uint8_t *data,
+ size_t num) {
uint32_t a, b, c, d, e, f, g, h, s0, s1, T1;
uint32_t X[16];
int i;
@@ -308,7 +310,33 @@
}
}
-#endif // !SHA256_ASM
+#endif // !defined(SHA256_ASM_NOHW)
+
+static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
+ size_t num) {
+#if defined(SHA256_ASM_HW)
+ if (sha256_hw_capable()) {
+ sha256_block_data_order_hw(state, data, num);
+ return;
+ }
+#endif
+#if defined(SHA256_ASM_AVX)
+ if (sha256_avx_capable()) {
+ sha256_block_data_order_avx(state, data, num);
+ return;
+ }
+#endif
+#if defined(SHA256_ASM_SSSE3)
+ if (sha256_ssse3_capable()) {
+ sha256_block_data_order_ssse3(state, data, num);
+ return;
+ }
+#endif
+ sha256_block_data_order_nohw(state, data, num);
+}
+
+#endif // !defined(SHA256_ASM)
+
void SHA256_TransformBlocks(uint32_t state[8], const uint8_t *data,
size_t num_blocks) {
diff --git a/crypto/fipsmodule/sha/sha512.c b/crypto/fipsmodule/sha/sha512.c
index 2c7ce31..0f4142c 100644
--- a/crypto/fipsmodule/sha/sha512.c
+++ b/crypto/fipsmodule/sha/sha512.c
@@ -279,7 +279,9 @@
return 1;
}
-#ifndef SHA512_ASM
+#if !defined(SHA512_ASM)
+
+#if !defined(SHA512_ASM_NOHW)
static const uint64_t K512[80] = {
UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd),
UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc),
@@ -341,8 +343,8 @@
#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
// This code should give better results on 32-bit CPU with less than
// ~24 registers, both size and performance wise...
-static void sha512_block_data_order(uint64_t *state, const uint8_t *in,
- size_t num) {
+static void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *in,
+ size_t num) {
uint64_t A, E, T;
uint64_t X[9 + 80], *F;
int i;
@@ -414,8 +416,8 @@
ROUND_00_15(i + j, a, b, c, d, e, f, g, h); \
} while (0)
-static void sha512_block_data_order(uint64_t *state, const uint8_t *in,
- size_t num) {
+static void sha512_block_data_order_nohw(uint64_t *state, const uint8_t *in,
+ size_t num) {
uint64_t a, b, c, d, e, f, g, h, s0, s1, T1;
uint64_t X[16];
int i;
@@ -498,6 +500,25 @@
#endif
+#endif // !SHA512_ASM_NOHW
+
+static void sha512_block_data_order(uint64_t *state, const uint8_t *data,
+ size_t num) {
+#if defined(SHA512_ASM_HW)
+ if (sha512_hw_capable()) {
+ sha512_block_data_order_hw(state, data, num);
+ return;
+ }
+#endif
+#if defined(SHA512_ASM_AVX)
+ if (sha512_avx_capable()) {
+ sha512_block_data_order_avx(state, data, num);
+ return;
+ }
+#endif
+ sha512_block_data_order_nohw(state, data, num);
+}
+
#endif // !SHA512_ASM
#undef Sigma0
diff --git a/crypto/fipsmodule/sha/sha_test.cc b/crypto/fipsmodule/sha/sha_test.cc
index 5029bb0..22856f8 100644
--- a/crypto/fipsmodule/sha/sha_test.cc
+++ b/crypto/fipsmodule/sha/sha_test.cc
@@ -42,41 +42,100 @@
}
}
-#if defined(SHA1_ASM) && defined(SUPPORTS_ABI_TEST)
+#if defined(SUPPORTS_ABI_TEST)
+
TEST(SHATest, SHA1ABI) {
SHA_CTX ctx;
SHA1_Init(&ctx);
static const uint8_t kBuf[SHA_CBLOCK * 8] = {0};
- CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 1);
- CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 2);
- CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 4);
- CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, 8);
+ for (size_t blocks : {1, 2, 4, 8}) {
+#if defined(SHA1_ASM)
+ CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, blocks);
+#endif
+#if defined(SHA1_ASM_HW)
+ if (sha1_hw_capable()) {
+ CHECK_ABI(sha1_block_data_order_hw, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA1_ASM_AVX2)
+ if (sha1_avx2_capable()) {
+ CHECK_ABI(sha1_block_data_order_avx2, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA1_ASM_AVX)
+ if (sha1_avx_capable()) {
+ CHECK_ABI(sha1_block_data_order_avx, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA1_ASM_SSSE3)
+ if (sha1_ssse3_capable()) {
+ CHECK_ABI(sha1_block_data_order_ssse3, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA1_ASM_NOHW)
+ CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
+#endif
+ }
}
-#endif // SHA1_ASM && SUPPORTS_ABI_TEST
-#if defined(SHA256_ASM) && defined(SUPPORTS_ABI_TEST)
TEST(SHATest, SHA256ABI) {
SHA256_CTX ctx;
SHA256_Init(&ctx);
static const uint8_t kBuf[SHA256_CBLOCK * 8] = {0};
- CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 1);
- CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 2);
- CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 4);
- CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, 8);
+ for (size_t blocks : {1, 2, 4, 8}) {
+#if defined(SHA256_ASM)
+ CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, blocks);
+#endif
+#if defined(SHA256_ASM_HW)
+ if (sha256_hw_capable()) {
+ CHECK_ABI(sha256_block_data_order_hw, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA256_ASM_AVX)
+ if (sha256_avx_capable()) {
+ CHECK_ABI(sha256_block_data_order_avx, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA256_ASM_SSSE3)
+ if (sha256_ssse3_capable()) {
+ CHECK_ABI(sha256_block_data_order_ssse3, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA256_ASM_NOHW)
+ CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
+#endif
+ }
}
-#endif // SHA256_ASM && SUPPORTS_ABI_TEST
-#if defined(SHA512_ASM) && defined(SUPPORTS_ABI_TEST)
TEST(SHATest, SHA512ABI) {
SHA512_CTX ctx;
SHA512_Init(&ctx);
static const uint8_t kBuf[SHA512_CBLOCK * 4] = {0};
- CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 1);
- CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 2);
- CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 3);
- CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, 4);
+ for (size_t blocks : {1, 2, 3, 4}) {
+#if defined(SHA512_ASM)
+ CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, blocks);
+#endif
+#if defined(SHA512_ASM_HW)
+ if (sha512_hw_capable()) {
+ CHECK_ABI(sha512_block_data_order_hw, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA512_ASM_AVX)
+ if (sha512_avx_capable()) {
+ CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
+ }
+#endif
+#if defined(SHA512_ASM_NOHW)
+ CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
+#endif
+ }
}
-#endif // SHA512_ASM && SUPPORTS_ABI_TEST
+
+#endif // SUPPORTS_ABI_TEST
diff --git a/crypto/internal.h b/crypto/internal.h
index f2db41c..e9da010 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -1450,6 +1450,15 @@
#endif
}
+// SHA-1 and SHA-256 are defined as a single extension.
+OPENSSL_INLINE int CRYPTO_is_x86_SHA_capable(void) {
+#if defined(__SHA__)
+ return 1;
+#else
+ return (OPENSSL_get_ia32cap(2) & (1u << 29)) != 0;
+#endif
+}
+
#endif // OPENSSL_X86 || OPENSSL_X86_64
#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
@@ -1519,6 +1528,41 @@
#endif
}
+OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA1_capable(void) {
+ // SHA-1 and SHA-256 (only) share |__ARM_FEATURE_SHA2|; otherwise they
+ // are dealt with independently.
+#if defined(OPENSSL_STATIC_ARMCAP_SHA1) || defined(__ARM_FEATURE_SHA2)
+ return 1;
+#elif defined(OPENSSL_STATIC_ARMCAP)
+ return 0;
+#else
+ return (OPENSSL_get_armcap() & ARMV8_SHA1) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA256_capable(void) {
+ // SHA-1 and SHA-256 (only) share |__ARM_FEATURE_SHA2|; otherwise they
+ // are dealt with independently.
+#if defined(OPENSSL_STATIC_ARMCAP_SHA256) || defined(__ARM_FEATURE_SHA2)
+ return 1;
+#elif defined(OPENSSL_STATIC_ARMCAP)
+ return 0;
+#else
+ return (OPENSSL_get_armcap() & ARMV8_SHA256) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA512_capable(void) {
+ // There is no |OPENSSL_STATIC_ARMCAP_SHA512|.
+#if defined(__ARM_FEATURE_SHA512)
+ return 1;
+#elif defined(OPENSSL_STATIC_ARMCAP)
+ return 0;
+#else
+ return (OPENSSL_get_armcap() & ARMV8_SHA512) != 0;
+#endif
+}
+
#endif // OPENSSL_ARM || OPENSSL_AARCH64
#if defined(BORINGSSL_DISPATCH_TEST)