Move capability checks in sha256-586.pl to C

This took a little restructuring because they were previously all one
big function. Some notes:

- label() and set_label() in x86 perlasm default to function-scoped. But
  static_label() marks a label as file-scoped, which is why "pic_point"
  and "K256" work.

- There's a pretty sizeable common preamble. I just copied it to each
  for simplicity. I'm pretty sure some of it is wasted, but it's
  definitely not all wasted, between loading parameters, setting up
  stack alignment, and saving the old stack location. But I'm not sure
  whether all 16 bytes reserved on the stack by the preamble are
  actually used.

Bug: 673
Change-Id: I6e8671d05d07cb4676ecf117dd56e2ed355c5d19
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/65874
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/sha/asm/sha256-586.pl b/crypto/fipsmodule/sha/asm/sha256-586.pl
index ab821e7..ad6afa2 100644
--- a/crypto/fipsmodule/sha/asm/sha256-586.pl
+++ b/crypto/fipsmodule/sha/asm/sha256-586.pl
@@ -84,7 +84,7 @@
 # versions, but BoringSSL is intended to be used with pre-generated perlasm
 # output, so this isn't useful anyway.
 #
-# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2.
+# TODO(davidben): Enable AVX+BMI2 code after testing by setting $avx to 2.
 $avx = 1;
 
 $avx = 0 unless ($xmm);
@@ -185,9 +185,9 @@
 	 &add	($A,$T);		# h += T
 }
 
-&external_label("OPENSSL_ia32cap_P")		if (!$i386);
+&static_label("K256");
 
-&function_begin("sha256_block_data_order");
+&function_begin("sha256_block_data_order_nohw");
 	&mov	("esi",wparam(0));	# ctx
 	&mov	("edi",wparam(1));	# inp
 	&mov	("eax",wparam(2));	# num
@@ -208,26 +208,6 @@
 	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
 	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
 						if (!$i386 && $xmm) {
-	&picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
-	&mov	("ecx",&DWP(0,"edx"));
-	&mov	("ebx",&DWP(4,"edx"));
-	&mov	("edx",&DWP(8,"edx"))	if ($xmm);
-	&test	("ecx",1<<24);		# check for FXSR
-	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
-	&and	("ecx",1<<30);		# mask "Intel CPU" bit
-	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
-	&test	("edx",1<<29)		if ($shaext);	# check for SHA
-	&jnz	(&label("shaext"))	if ($shaext);
-	&or	("ecx","ebx");
-	&and	("ecx",1<<28|1<<30);
-	&cmp	("ecx",1<<28|1<<30);
-					if ($xmm) {
-	&je	(&label("AVX"))		if ($avx);
-	&test	("ebx",1<<9);		# check for SSSE3
-	&jnz	(&label("SSSE3"));
-					} else {
-	&je	(&label("loop_shrd"));
-					}
 						if ($unroll_after) {
 &set_label("no_xmm");
 	&sub	("eax","edi");
@@ -515,6 +495,8 @@
 	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
 &function_end_A();
 }
+&function_end_B("sha256_block_data_order_nohw");
+
 						if (!$i386 && $xmm) {{{
 if ($shaext) {
 ######################################################################
@@ -533,7 +515,33 @@
 sub sha256msg1	{ sha256op38(0xcc,@_); }
 sub sha256msg2	{ sha256op38(0xcd,@_); }
 
-&set_label("shaext",32);
+&function_begin("sha256_block_data_order_hw");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+	# TODO(davidben): The preamble above this point comes from the original
+	# merged sha256_block_data_order function, which performed some common
+	# setup and then jumped to the particular SHA-256 implementation. The
+	# parts of the preamble that do not apply to this function can be
+	# removed.
+
 	&sub		("esp",32);
 
 	&movdqu		($ABEF,&QWP(0,$ctx));		# DCBA
@@ -653,14 +661,40 @@
 	&mov		("esp",&DWP(32+12,"esp"));
 	&movdqu		(&QWP(0,$ctx),$ABEF);
 	&movdqu		(&QWP(16,$ctx),$CDGH);
-&function_end_A();
+&function_end("sha256_block_data_order_hw");
 }
 
 my @X = map("xmm$_",(0..3));
 my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
 my @AH = ($A,$T);
 
-&set_label("SSSE3",32);
+&function_begin("sha256_block_data_order_ssse3");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+	# TODO(davidben): The preamble above this point comes from the original
+	# merged sha256_block_data_order function, which performed some common
+	# setup and then jumped to the particular SHA-256 implementation. The
+	# parts of the preamble that do not apply to this function can be
+	# removed.
+
 	&lea	("esp",&DWP(-96,"esp"));
 	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
 	&mov	($AH[0],&DWP(0,"esi"));
@@ -968,14 +1002,36 @@
 	&jb	(&label("grand_ssse3"));
 
 	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
-&function_end_A();
+&function_end("sha256_block_data_order_ssse3");
+
 						if ($avx) {
-&set_label("AVX",32);
-						if ($avx>1) {
-	&and	("edx",1<<8|1<<3);		# check for BMI2+BMI1
-	&cmp	("edx",1<<8|1<<3);
-	&je	(&label("AVX_BMI"));
-						}
+&function_begin("sha256_block_data_order_avx");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+	# TODO(davidben): The preamble above this point comes from the original
+	# merged sha256_block_data_order function, which performed some common
+	# setup and then jumped to the particular SHA-256 implementation. The
+	# parts of the preamble that do not apply to this function can be
+	# removed.
+
 	&lea	("esp",&DWP(-96,"esp"));
 	&vzeroall	();
 	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
@@ -1135,7 +1191,8 @@
 
 	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
 	&vzeroall	();
-&function_end_A();
+&function_end("sha256_block_data_order_avx");
+
 						if ($avx>1) {
 sub bodyx_00_15 () {			# +10%
 	(
@@ -1172,7 +1229,34 @@
 	);
 }
 
-&set_label("AVX_BMI",32);
+# If enabled, this function should be gated on AVX, BMI1, and BMI2.
+&function_begin("sha256_block_data_order_avx_bmi");
+	&mov	("esi",wparam(0));	# ctx
+	&mov	("edi",wparam(1));	# inp
+	&mov	("eax",wparam(2));	# num
+	&mov	("ebx","esp");		# saved sp
+
+	&call	(&label("pic_point"));	# make it PIC!
+&set_label("pic_point");
+	&blindpop($K256);
+	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
+
+	&sub	("esp",16);
+	&and	("esp",-64);
+
+	&shl	("eax",6);
+	&add	("eax","edi");
+	&mov	(&DWP(0,"esp"),"esi");	# ctx
+	&mov	(&DWP(4,"esp"),"edi");	# inp
+	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
+	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
+
+	# TODO(davidben): The preamble above this point comes from the original
+	# merged sha256_block_data_order function, which performed some common
+	# setup and then jumped to the particular SHA-256 implementation. The
+	# parts of the preamble that do not apply to this function can be
+	# removed.
+
 	&lea	("esp",&DWP(-96,"esp"));
 	&vzeroall	();
 	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
@@ -1276,11 +1360,10 @@
 
 	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
 	&vzeroall	();
-&function_end_A();
+&function_end("sha256_block_data_order_avx_bmi");
 						}
 						}
 						}}}
-&function_end_B("sha256_block_data_order");
 
 &asm_finish();
 
diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h
index 7082e64..d1ebbb8 100644
--- a/crypto/fipsmodule/sha/internal.h
+++ b/crypto/fipsmodule/sha/internal.h
@@ -79,6 +79,7 @@
 #elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
 
 #define SHA1_ASM_NOHW
+#define SHA256_ASM_NOHW
 
 #define SHA1_ASM_SSSE3
 OPENSSL_INLINE int sha1_ssse3_capable(void) {
@@ -103,11 +104,31 @@
 void sha1_block_data_order_avx(uint32_t state[5], const uint8_t *data,
                                size_t num);
 
+#define SHA256_ASM_SSSE3
+OPENSSL_INLINE int sha256_ssse3_capable(void) {
+  // TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
+  // say to.
+  return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable();
+}
+void sha256_block_data_order_ssse3(uint32_t state[8], const uint8_t *data,
+                                   size_t num);
+
+#define SHA256_ASM_AVX
+OPENSSL_INLINE int sha256_avx_capable(void) {
+  // Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
+  // discussion in sha1-586.pl.
+  //
+  // TODO(davidben): Should we enable SHAEXT on 32-bit x86?
+  // TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
+  // say to.
+  return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu() &&
+         CRYPTO_is_FXSR_capable();
+}
+void sha256_block_data_order_avx(uint32_t state[8], const uint8_t *data,
+                                 size_t num);
+
 // TODO(crbug.com/boringssl/673): Move the remaining CPU dispatch to C.
-#define SHA256_ASM
 #define SHA512_ASM
-void sha256_block_data_order(uint32_t state[8], const uint8_t *data,
-                             size_t num_blocks);
 void sha512_block_data_order(uint64_t state[8], const uint8_t *data,
                              size_t num_blocks);