bn: Move x86-64 argument-based dispatching of bn_mul_mont to C.

Take a step towards moving the OPENSSL_ia32cap_P usage out of
x86_64-mont.pl. The MULX+ADX dispatching within |bn_sqr8x_mont| is
deferred to a future change.

Bug: 673
Change-Id: I8768bb33d2c289fd7ccf8743b51721e55ab74f35
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/65527
Reviewed-by: Bob Beck <bbe@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
index be4c69b..875a5a5 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
@@ -65,7 +65,7 @@
 # output, so this isn't useful anyway.
 $addx = 1;
 
-# int bn_mul_mont(
+# int bn_mul_mont_nohw(
 $rp="%rdi";	# BN_ULONG *rp,
 $ap="%rsi";	# const BN_ULONG *ap,
 $bp="%rdx";	# const BN_ULONG *bp,
@@ -87,33 +87,15 @@
 
 .extern	OPENSSL_ia32cap_P
 
-.globl	bn_mul_mont
-.type	bn_mul_mont,\@function,6
+.globl	bn_mul_mont_nohw
+.type	bn_mul_mont_nohw,\@function,6
 .align	16
-bn_mul_mont:
+bn_mul_mont_nohw:
 .cfi_startproc
 	_CET_ENDBR
 	mov	${num}d,${num}d
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-	test	\$3,${num}d
-	jnz	.Lmul_enter
-	cmp	\$8,${num}d
-	jb	.Lmul_enter
-___
-$code.=<<___ if ($addx);
-	leaq	OPENSSL_ia32cap_P(%rip),%r11
-	mov	8(%r11),%r11d
-___
-$code.=<<___;
-	cmp	$ap,$bp
-	jne	.Lmul4x_enter
-	test	\$7,${num}d
-	jz	.Lsqr8x_enter
-	jmp	.Lmul4x_enter
-
-.align	16
-.Lmul_enter:
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -348,27 +330,21 @@
 .Lmul_epilogue:
 	ret
 .cfi_endproc
-.size	bn_mul_mont,.-bn_mul_mont
+.size	bn_mul_mont_nohw,.-bn_mul_mont_nohw
 ___
 {{{
 my @A=("%r10","%r11");
 my @N=("%r13","%rdi");
 $code.=<<___;
+.globl	bn_mul4x_mont
 .type	bn_mul4x_mont,\@function,6
 .align	16
 bn_mul4x_mont:
 .cfi_startproc
+	_CET_ENDBR
 	mov	${num}d,${num}d
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lmul4x_enter:
-___
-$code.=<<___ if ($addx);
-	and	\$0x80100,%r11d
-	cmp	\$0x80100,%r11d
-	je	.Lmulx4x_enter
-___
-$code.=<<___;
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -806,7 +782,7 @@
 }}}
 {{{
 ######################################################################
-# void bn_sqr8x_mont(
+# int bn_sqr8x_mont(
 my $rptr="%rdi";	# const BN_ULONG *rptr,
 my $aptr="%rsi";	# const BN_ULONG *aptr,
 my $bptr="%rdx";	# not used
@@ -825,13 +801,15 @@
 $code.=<<___;
 .extern	bn_sqr8x_internal		# see x86_64-mont5 module
 
+.globl	bn_sqr8x_mont
 .type	bn_sqr8x_mont,\@function,6
 .align	32
 bn_sqr8x_mont:
 .cfi_startproc
+	_CET_ENDBR
+	mov	${num}d,${num}d
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lsqr8x_enter:
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -1024,13 +1002,14 @@
 my $bp="%rdx";	# original value
 
 $code.=<<___;
+.globl	bn_mulx4x_mont
 .type	bn_mulx4x_mont,\@function,6
 .align	32
 bn_mulx4x_mont:
 .cfi_startproc
+	_CET_ENDBR
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lmulx4x_enter:
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -1535,9 +1514,9 @@
 
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_bn_mul_mont
-	.rva	.LSEH_end_bn_mul_mont
-	.rva	.LSEH_info_bn_mul_mont
+	.rva	.LSEH_begin_bn_mul_mont_nohw
+	.rva	.LSEH_end_bn_mul_mont_nohw
+	.rva	.LSEH_info_bn_mul_mont_nohw
 
 	.rva	.LSEH_begin_bn_mul4x_mont
 	.rva	.LSEH_end_bn_mul4x_mont
@@ -1555,7 +1534,7 @@
 $code.=<<___;
 .section	.xdata
 .align	8
-.LSEH_info_bn_mul_mont:
+.LSEH_info_bn_mul_mont_nohw:
 	.byte	9,0,0,0
 	.rva	mul_handler
 	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
diff --git a/crypto/fipsmodule/bn/bn_test.cc b/crypto/fipsmodule/bn/bn_test.cc
index 08c4719..13042ea 100644
--- a/crypto/fipsmodule/bn/bn_test.cc
+++ b/crypto/fipsmodule/bn/bn_test.cc
@@ -2881,10 +2881,33 @@
     a[0] = 1;
     b[0] = 42;
 
+#if defined(OPENSSL_X86_64)
+    if (bn_mulx4x_mont_capable(words)) {
+      CHECK_ABI(bn_mulx4x_mont, r.data(), a.data(), b.data(), mont->N.d,
+                mont->n0, words);
+      CHECK_ABI(bn_mulx4x_mont, r.data(), a.data(), a.data(), mont->N.d,
+                mont->n0, words);
+    }
+    if (bn_mul4x_mont_capable(words)) {
+      CHECK_ABI(bn_mul4x_mont, r.data(), a.data(), b.data(), mont->N.d,
+                mont->n0, words);
+      CHECK_ABI(bn_mul4x_mont, r.data(), a.data(), a.data(), mont->N.d,
+                mont->n0, words);
+    }
+    CHECK_ABI(bn_mul_mont_nohw, r.data(), a.data(), b.data(), mont->N.d,
+              mont->n0, words);
+    CHECK_ABI(bn_mul_mont_nohw, r.data(), a.data(), a.data(), mont->N.d,
+              mont->n0, words);
+    if (bn_sqr8x_mont_capable(words)) {
+      CHECK_ABI(bn_sqr8x_mont, r.data(), a.data(), a.data(), mont->N.d,
+                mont->n0, words);
+    }
+#else
     CHECK_ABI(bn_mul_mont, r.data(), a.data(), b.data(), mont->N.d, mont->n0,
               words);
     CHECK_ABI(bn_mul_mont, r.data(), a.data(), a.data(), mont->N.d, mont->n0,
               words);
+#endif
   }
 }
 #endif   // OPENSSL_BN_ASM_MONT && SUPPORTS_ABI_TEST
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index d556488..4de201f 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -400,6 +400,29 @@
 // inputs.
 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                 const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+
+#if defined(OPENSSL_X86_64)
+int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                     const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+OPENSSL_INLINE int bn_mul4x_mont_capable(size_t num) {
+  return (num >= 8) && ((num & 3) == 0);  // |num| >= 8 and a multiple of 4.
+}
+int bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                  const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+OPENSSL_INLINE int bn_mulx4x_mont_capable(size_t num) {
+  // Same size rule as bn_mul4x_mont, plus CPU support. MULX is in BMI2.
+  return bn_mul4x_mont_capable(num) && CRYPTO_is_BMI2_capable() &&
+         CRYPTO_is_ADX_capable();
+}
+int bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+OPENSSL_INLINE int bn_sqr8x_mont_capable(size_t num) {
+  return (num >= 8) && ((num & 7) == 0);  // Multiple of 8; >= 8 excludes 0.
+}
+int bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *unused_bp,
+                  const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+#endif // defined(OPENSSL_X86_64)
+
 #endif
 
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
diff --git a/crypto/fipsmodule/bn/montgomery.c b/crypto/fipsmodule/bn/montgomery.c
index f219d42..86b64c6 100644
--- a/crypto/fipsmodule/bn/montgomery.c
+++ b/crypto/fipsmodule/bn/montgomery.c
@@ -504,3 +504,20 @@
   }
   OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG));
 }
+
+#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_X86_64)
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
+  // Dispatch to the first kernel whose *_capable() predicate accepts |num|.
+  if (ap == bp && bn_sqr8x_mont_capable(num)) {  // Squaring: same operand.
+    return bn_sqr8x_mont(rp, ap, bp, np, n0, num);
+  }
+  if (bn_mulx4x_mont_capable(num)) {
+    return bn_mulx4x_mont(rp, ap, bp, np, n0, num);
+  }
+  if (bn_mul4x_mont_capable(num)) {
+    return bn_mul4x_mont(rp, ap, bp, np, n0, num);
+  }
+  return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);  // Generic fallback.
+}
+#endif