Move capability checks in sha256-586.pl to C

This took a little restructuring because they were previously all one
big function. Some notes:

- label() and set_label() in x86 perlasm default to function-scoped. But
  static_label() marks a label as file-scoped, which is why "pic_point"
  and "K256" work.

- There's a pretty sizeable common preamble. I just copied it to each
  for simplicity. I'm pretty sure some of it is wasted, but it's
  definitely not all wasted, between loading parameters, setting up
  stack alignment, and saving the old stack location. But I'm not sure
  whether all 16 bytes reserved on the stack by the preamble are
  actually used.

Bug: 673
Change-Id: I6e8671d05d07cb4676ecf117dd56e2ed355c5d19
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/65874
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha256-586.pl b/crypto/fipsmodule/sha/asm/sha256-586.pl
index ab821e7..ad6afa2 100644
--- a/crypto/fipsmodule/sha/asm/sha256-586.pl
+++ b/crypto/fipsmodule/sha/asm/sha256-586.pl
@@ -84,7 +84,7 @@
 # versions, but BoringSSL is intended to be used with pre-generated perlasm
 # output, so this isn't useful anyway.
 #
-# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2.
+# TODO(davidben): Enable AVX+BMI2 code after testing by setting $avx to 2.
 $avx = 1;
 
 $avx = 0 unless ($xmm);
@@ -185,9 +185,9 @@
 	 &add	($A,$T);		# h += T
 }
 
-&external_label("OPENSSL_ia32cap_P")		if (!$i386);
+&static_label("K256");
 
-&function_begin("sha256_block_data_order");
+&function_begin("sha256_block_data_order_nohw");
 	&mov	("esi",wparam(0));	# ctx
 	&mov	("edi",wparam(1));	# inp
 	&mov	("eax",wparam(2));	# num
@@ -208,26 +208,6 @@
 	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
 	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
 						if (!$i386 && $xmm) {
-	&picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
-	&mov	("ecx",&DWP(0,"edx"));
-	&mov	("ebx",&DWP(4,"edx"));
-	&mov	("edx",&DWP(8,"edx"))	if ($xmm);
-	&test	("ecx",1<<24);		# check for FXSR
-	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
-	&and	("ecx",1<<30);		# mask "Intel CPU" bit
-	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
-	&test	("edx",1<<29)		if ($shaext);	# check for SHA
-	&jnz	(&label("shaext"))	if ($shaext);
-	&or	("ecx","ebx");
-	&and	("ecx",1<<28|1<<30);
-	&cmp	("ecx",1<<28|1<<30);
-					if ($xmm) {
-	&je	(&label("AVX"))		if ($avx);
-	&test	("ebx",1<<9);		# check for SSSE3
-	&jnz	(&label("SSSE3"));
-					} else {
-	&je	(&label("loop_shrd"));
-					}
 						if ($unroll_after) {
 &set_label("no_xmm");
 	&sub	("eax","edi");
@@ -515,6 +495,8 @@
 	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
 &function_end_A();
 }
+&function_end_B("sha256_block_data_order_nohw");
+
 						if (!$i386 && $xmm) {{{
 if ($shaext) {
 ######################################################################
@@ -533,7 +515,33 @@
 sub sha256msg1	{ sha256op38(0xcc,@_); }
 sub sha256msg2	{ sha256op38(0xcd,@_); }
 
-&set_label("shaext",32);
+&function_begin("sha256_block_data_order_hw");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+	# TODO(davidben): The preamble above this point comes from the original
+	# merged sha256_block_data_order function, which performed some common
+	# setup and then jumped to the particular SHA-256 implementation. The
+	# parts of the preamble that do not apply to this function can be
+	# removed.
+
 	&sub		("esp",32);
 
 	&movdqu		($ABEF,&QWP(0,$ctx));		# DCBA
@@ -653,14 +661,40 @@
 	&mov		("esp",&DWP(32+12,"esp"));
 	&movdqu		(&QWP(0,$ctx),$ABEF);
 	&movdqu		(&QWP(16,$ctx),$CDGH);
-&function_end_A();
+&function_end("sha256_block_data_order_hw");
 }
 
 my @X = map("xmm$_",(0..3));
 my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
 my @AH = ($A,$T);
 
-&set_label("SSSE3",32);
+&function_begin("sha256_block_data_order_ssse3");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+	# TODO(davidben): The preamble above this point comes from the original
+	# merged sha256_block_data_order function, which performed some common
+	# setup and then jumped to the particular SHA-256 implementation. The
+	# parts of the preamble that do not apply to this function can be
+	# removed.
+
 	&lea	("esp",&DWP(-96,"esp"));
 	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
 	&mov	($AH[0],&DWP(0,"esi"));
@@ -968,14 +1002,36 @@
 	&jb	(&label("grand_ssse3"));
 
 	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
-&function_end_A();
+&function_end("sha256_block_data_order_ssse3");
+
 						if ($avx) {
-&set_label("AVX",32);
-						if ($avx>1) {
-	&and	("edx",1<<8|1<<3);		# check for BMI2+BMI1
-	&cmp	("edx",1<<8|1<<3);
-	&je	(&label("AVX_BMI"));
-						}
+&function_begin("sha256_block_data_order_avx");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+	# TODO(davidben): The preamble above this point comes from the original
+	# merged sha256_block_data_order function, which performed some common
+	# setup and then jumped to the particular SHA-256 implementation. The
+	# parts of the preamble that do not apply to this function can be
+	# removed.
+
 	&lea	("esp",&DWP(-96,"esp"));
 	&vzeroall	();
 	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
@@ -1135,7 +1191,8 @@
 
 	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
 	&vzeroall	();
-&function_end_A();
+&function_end("sha256_block_data_order_avx");
+
 						if ($avx>1) {
 sub bodyx_00_15 () {			# +10%
 	(
@@ -1172,7 +1229,34 @@
 	);
 }
 
-&set_label("AVX_BMI",32);
+# If enabled, this function should be gated on AVX, BMI1, and BMI2.
+&function_begin("sha256_block_data_order_avx_bmi");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+	# TODO(davidben): The preamble above this point comes from the original
+	# merged sha256_block_data_order function, which performed some common
+	# setup and then jumped to the particular SHA-256 implementation. The
+	# parts of the preamble that do not apply to this function can be
+	# removed.
+
 	&lea	("esp",&DWP(-96,"esp"));
 	&vzeroall	();
 	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
@@ -1276,11 +1360,10 @@
 
 	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
 	&vzeroall	();
-&function_end_A();
+&function_end("sha256_block_data_order_avx_bmi");
 						}
 						}
 						}}}
-&function_end_B("sha256_block_data_order");
 
 &asm_finish();
 
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 7082e64..d1ebbb8 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -79,6 +79,7 @@
 #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
 
 #define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
 
 #define SHA1_ASM_SSSE3
 OPENSSL_INLINE int sha1_ssse3_capable(void) {
@@ -103,11 +104,31 @@
 void sha1_block_data_order_avx(uint32_t state[5], const uint8_t *data,
                                size_t num);
 
+#define SHA256_ASM_SSSE3
+OPENSSL_INLINE int sha256_ssse3_capable(void) {
+  // TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
+  // say to.
+  return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable();
+}
+void sha256_block_data_order_ssse3(uint32_t state[8], const uint8_t *data,
+                                   size_t num);
+
+#define SHA256_ASM_AVX
+OPENSSL_INLINE int sha256_avx_capable(void) {
+  // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+  // discussion in sha1-586.pl.
+  //
+  // TODO(davidben): Should we enable SHAEXT on 32-bit x86?
+  // TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
+  // say to.
+  return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu() &&
+         CRYPTO_is_FXSR_capable();
+}
+void sha256_block_data_order_avx(uint32_t state[8], const uint8_t *data,
+                                 size_t num);
+
 // TODO(crbug.com/boringssl/673): Move the remaining CPU dispatch to C.
-#define SHA256_ASM
 #define SHA512_ASM
-void sha256_block_data_order(uint32_t state[8], const uint8_t *data,
-                             size_t num_blocks);
 void sha512_block_data_order(uint64_t state[8], const uint8_t *data,
                              size_t num_blocks);