Move capability checks in sha1-586.pl to C

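sha1_block_data_order is split into separately named entry points
(sha1_block_data_order_nohw, sha1_block_data_order_ssse3, and
sha1_block_data_order_avx), and the capability checks that used to be
done in the assembly prologue are now expressed in C as
sha1_ssse3_capable() and sha1_avx_capable() in internal.h.

sha256-586.pl and sha512-586.pl each have their own challenges, so they
will be handled in separate changes.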

Bug: 673
Change-Id: Ic9be0454fddf75e7f49bcccd8a86a4ff8862ff67
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/65872
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha1-586.pl b/crypto/fipsmodule/sha/asm/sha1-586.pl
index 7952636..4be06e7 100644
--- a/crypto/fipsmodule/sha/asm/sha1-586.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-586.pl
@@ -141,8 +141,6 @@
 # been tested.
 $shaext = 0;
 
-&external_label("OPENSSL_ia32cap_P") if ($xmm);
-
 
 $A="eax";
 $B="ebx";
@@ -318,40 +316,9 @@
 }
 	}
 
-&function_begin("sha1_block_data_order");
-if ($xmm) {
-  &static_label("shaext_shortcut")	if ($shaext);
-  &static_label("ssse3_shortcut");
-  &static_label("avx_shortcut")		if ($ymm);
-  &static_label("K_XX_XX");
+&static_label("K_XX_XX");
 
-	&call	(&label("pic_point"));	# make it PIC!
-  &set_label("pic_point");
-	&blindpop($tmp1);
-	&picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
-	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
-
-	&mov	($A,&DWP(0,$T));
-	&mov	($D,&DWP(4,$T));
-	&test	($D,1<<9);		# check SSSE3 bit
-	&jz	(&label("x86"));
-	&mov	($C,&DWP(8,$T));
-	&test	($A,1<<24);		# check FXSR bit
-	&jz	(&label("x86"));
-	if ($shaext) {
-		&test	($C,1<<29);		# check SHA bit
-		&jnz	(&label("shaext_shortcut"));
-	}
-	if ($ymm) {
-		&and	($D,1<<28);		# mask AVX bit
-		&and	($A,1<<30);		# mask "Intel CPU" bit
-		&or	($A,$D);
-		&cmp	($A,1<<28|1<<30);
-		&je	(&label("avx_shortcut"));
-	}
-	&jmp	(&label("ssse3_shortcut"));
-  &set_label("x86",16);
-}
+&function_begin("sha1_block_data_order_nohw");
 	&mov($tmp1,&wparam(0));	# SHA_CTX *c
 	&mov($T,&wparam(1));	# const void *input
 	&mov($A,&wparam(2));	# size_t num
@@ -417,7 +384,7 @@
 	&jb(&label("loop"));
 
 	&stack_pop(16+3);
-&function_end("sha1_block_data_order");
+&function_end("sha1_block_data_order_nohw");
 
 if ($xmm) {
 if ($shaext) {
@@ -442,12 +409,11 @@
 sub sha1msg1	{ sha1op38(0xc9,@_); }
 sub sha1msg2	{ sha1op38(0xca,@_); }
 
-&function_begin("_sha1_block_data_order_shaext");
+&function_begin("sha1_block_data_order_shaext");
 	&call	(&label("pic_point"));	# make it PIC!
 	&set_label("pic_point");
 	&blindpop($tmp1);
 	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
-&set_label("shaext_shortcut");
 	&mov	($ctx,&wparam(0));
 	&mov	("ebx","esp");
 	&mov	($inp,&wparam(1));
@@ -529,7 +495,7 @@
 	&movdqu	(&QWP(0,$ctx),$ABCD)
 	&movd	(&DWP(16,$ctx),$E);
 	&mov	("esp","ebx");
-&function_end("_sha1_block_data_order_shaext");
+&function_end("sha1_block_data_order_shaext");
 }
 ######################################################################
 # The SSSE3 implementation.
@@ -565,12 +531,11 @@
 my $_rol=sub { &rol(@_) };
 my $_ror=sub { &ror(@_) };
 
-&function_begin("_sha1_block_data_order_ssse3");
+&function_begin("sha1_block_data_order_ssse3");
 	&call	(&label("pic_point"));	# make it PIC!
 	&set_label("pic_point");
 	&blindpop($tmp1);
 	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
-&set_label("ssse3_shortcut");
 
 	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
 	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
@@ -1093,7 +1058,7 @@
 	&mov	(&DWP(12,@T[1]),$D);
 	&mov	(&DWP(16,@T[1]),$E);
 
-&function_end("_sha1_block_data_order_ssse3");
+&function_end("sha1_block_data_order_ssse3");
 
 $rx=0;	# reset
 
@@ -1108,12 +1073,11 @@
 my $_rol=sub { &shld(@_[0],@_) };
 my $_ror=sub { &shrd(@_[0],@_) };
 
-&function_begin("_sha1_block_data_order_avx");
+&function_begin("sha1_block_data_order_avx");
 	&call	(&label("pic_point"));	# make it PIC!
 	&set_label("pic_point");
 	&blindpop($tmp1);
 	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
-&set_label("avx_shortcut");
 	&vzeroall();
 
 	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
@@ -1466,7 +1430,7 @@
 	&mov	(&DWP(8,@T[1]),$C);
 	&mov	(&DWP(12,@T[1]),$D);
 	&mov	(&DWP(16,@T[1]),$E);
-&function_end("_sha1_block_data_order_avx");
+&function_end("sha1_block_data_order_avx");
 }
 &set_label("K_XX_XX",64);
 &data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 4a2f081..7082e64 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -26,20 +26,7 @@
 // Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
 // defined in assembly.
 
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
-
-#define SHA1_ASM
-#define SHA256_ASM
-#define SHA512_ASM
-
-void sha1_block_data_order(uint32_t state[5], const uint8_t *data,
-                           size_t num_blocks);
-void sha256_block_data_order(uint32_t state[8], const uint8_t *data,
-                             size_t num_blocks);
-void sha512_block_data_order(uint64_t state[8], const uint8_t *data,
-                             size_t num_blocks);
-
-#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
 
 #define SHA1_ASM_NOHW
 #define SHA256_ASM_NOHW
@@ -89,6 +76,41 @@
   return CRYPTO_is_ARMv8_SHA512_capable();
 }
 
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
+
+#define SHA1_ASM_NOHW
+
+#define SHA1_ASM_SSSE3
+OPENSSL_INLINE int sha1_ssse3_capable(void) {
+  // TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
+  // say to.
+  return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable();
+}
+void sha1_block_data_order_ssse3(uint32_t state[5], const uint8_t *data,
+                                 size_t num);
+
+#define SHA1_ASM_AVX
+OPENSSL_INLINE int sha1_avx_capable(void) {
+  // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+  // discussion in sha1-586.pl.
+  //
+  // TODO(davidben): Should we enable SHAEXT on 32-bit x86?
+  // TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
+  // say to.
+  return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu() &&
+         CRYPTO_is_FXSR_capable();
+}
+void sha1_block_data_order_avx(uint32_t state[5], const uint8_t *data,
+                               size_t num);
+
+// TODO(crbug.com/boringssl/673): Move the remaining CPU dispatch to C.
+#define SHA256_ASM
+#define SHA512_ASM
+void sha256_block_data_order(uint32_t state[8], const uint8_t *data,
+                             size_t num_blocks);
+void sha512_block_data_order(uint64_t state[8], const uint8_t *data,
+                             size_t num_blocks);
+
 #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
 
 #define SHA1_ASM_NOHW