bn: Move dispatching logic from x86_64-mont5.pl to C.

CL originally uploaded by Brian Smith at
https://boringssl-review.googlesource.com/c/boringssl/+/65569

Bug: 673
Change-Id: If84d34cae1c44cc883fc292dd048542e2b341f41
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68347
Reviewed-by: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
index 88d98af..a944739 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
@@ -50,7 +50,7 @@
 # output, so this isn't useful anyway.
 $addx = 1;
 
-# int bn_mul_mont_gather5(
+# int bn_mul_mont_gather5_nohw(
 $rp="%rdi";	# BN_ULONG *rp,
 $ap="%rsi";	# const BN_ULONG *ap,
 $bp="%rdx";	# const BN_ULONG *bp,
@@ -72,29 +72,17 @@
 $code=<<___;
 .text
 
-.extern	OPENSSL_ia32cap_P
-
-.globl	bn_mul_mont_gather5
-.type	bn_mul_mont_gather5,\@function,6
+.globl	bn_mul_mont_gather5_nohw
+.type	bn_mul_mont_gather5_nohw,\@function,6
 .align	64
-bn_mul_mont_gather5:
+bn_mul_mont_gather5_nohw:
 .cfi_startproc
 	_CET_ENDBR
+	# num is declared as an int, a 32-bit parameter, so the upper half is
+	# undefined. Zero the upper half to normalize it.
 	mov	${num}d,${num}d
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-	test	\$7,${num}d
-	jnz	.Lmul_enter
-___
-$code.=<<___ if ($addx);
-	leaq	OPENSSL_ia32cap_P(%rip),%r11
-	mov	8(%r11),%r11d
-___
-$code.=<<___;
-	jmp	.Lmul4x_enter
-
-.align	16
-.Lmul_enter:
 	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
 	push	%rbx
 .cfi_push	%rbx
@@ -454,27 +442,21 @@
 .Lmul_epilogue:
 	ret
 .cfi_endproc
-.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.size	bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw
 ___
 {{{
 my @A=("%r10","%r11");
 my @N=("%r13","%rdi");
 $code.=<<___;
+.globl	bn_mul4x_mont_gather5
 .type	bn_mul4x_mont_gather5,\@function,6
 .align	32
 bn_mul4x_mont_gather5:
 .cfi_startproc
+	_CET_ENDBR
 	.byte	0x67
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lmul4x_enter:
-___
-$code.=<<___ if ($addx);
-	and	\$0x80108,%r11d
-	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
-	je	.Lmulx4x_enter
-___
-$code.=<<___;
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -490,6 +472,9 @@
 .Lmul4x_prologue:
 
 	.byte	0x67
+	# num is declared as an int, a 32-bit parameter, so the upper half is
+	# undefined. It is important that this write to ${num}, which zeros the
+	# upper half, come before the first access.
 	shl	\$3,${num}d		# convert $num to bytes
 	lea	($num,$num,2),%r10	# 3*$num in bytes
 	neg	$num			# -$num
@@ -1079,7 +1064,7 @@
 }}}
 {{{
 ######################################################################
-# void bn_power5(
+# void bn_power5_nohw(
 my $rptr="%rdi";	# BN_ULONG *rptr,
 my $aptr="%rsi";	# const BN_ULONG *aptr,
 my $bptr="%rdx";	# const BN_ULONG *table,
@@ -1094,23 +1079,14 @@
 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
 
 $code.=<<___;
-.globl	bn_power5
-.type	bn_power5,\@function,6
+.globl	bn_power5_nohw
+.type	bn_power5_nohw,\@function,6
 .align	32
-bn_power5:
+bn_power5_nohw:
 .cfi_startproc
 	_CET_ENDBR
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-___
-$code.=<<___ if ($addx);
-	leaq	OPENSSL_ia32cap_P(%rip),%r11
-	mov	8(%r11),%r11d
-	and	\$0x80108,%r11d
-	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
-	je	.Lpowerx5_enter
-___
-$code.=<<___;
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -1125,6 +1101,9 @@
 .cfi_push	%r15
 .Lpower5_prologue:
 
+	# num is declared as an int, a 32-bit parameter, so the upper half is
+	# undefined. It is important that this write to ${num}, which zeros the
+	# upper half, come before the first access.
 	shl	\$3,${num}d		# convert $num to bytes
 	lea	($num,$num,2),%r10d	# 3*$num
 	neg	$num
@@ -1233,7 +1212,7 @@
 .Lpower5_epilogue:
 	ret
 .cfi_endproc
-.size	bn_power5,.-bn_power5
+.size	bn_power5_nohw,.-bn_power5_nohw
 
 .globl	bn_sqr8x_internal
 .hidden	bn_sqr8x_internal
@@ -2108,13 +2087,14 @@
 my $bp="%rdx";	# restore original value
 
 $code.=<<___;
+.globl	bn_mulx4x_mont_gather5
 .type	bn_mulx4x_mont_gather5,\@function,6
 .align	32
 bn_mulx4x_mont_gather5:
 .cfi_startproc
+	_CET_ENDBR
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lmulx4x_enter:
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -2129,6 +2109,9 @@
 .cfi_push	%r15
 .Lmulx4x_prologue:
 
+	# num is declared as an int, a 32-bit parameter, so the upper half is
+	# undefined. It is important that this write to ${num}, which zeros the
+	# upper half, come before the first access.
 	shl	\$3,${num}d		# convert $num to bytes
 	lea	($num,$num,2),%r10	# 3*$num in bytes
 	neg	$num			# -$num
@@ -2583,7 +2566,7 @@
 ___
 }{
 ######################################################################
-# void bn_power5(
+# void bn_powerx5(
 my $rptr="%rdi";	# BN_ULONG *rptr,
 my $aptr="%rsi";	# const BN_ULONG *aptr,
 my $bptr="%rdx";	# const BN_ULONG *table,
@@ -2598,13 +2581,14 @@
 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
 
 $code.=<<___;
+.globl	bn_powerx5
 .type	bn_powerx5,\@function,6
 .align	32
 bn_powerx5:
 .cfi_startproc
+	_CET_ENDBR
 	mov	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lpowerx5_enter:
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -2619,6 +2603,9 @@
 .cfi_push	%r15
 .Lpowerx5_prologue:
 
+	# num is declared as an int, a 32-bit parameter, so the upper half is
+	# undefined. It is important that this write to ${num}, which zeros the
+	# upper half, come before the first access.
 	shl	\$3,${num}d		# convert $num to bytes
 	lea	($num,$num,2),%r10	# 3*$num in bytes
 	neg	$num
@@ -3705,17 +3692,17 @@
 
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_bn_mul_mont_gather5
-	.rva	.LSEH_end_bn_mul_mont_gather5
-	.rva	.LSEH_info_bn_mul_mont_gather5
+	.rva	.LSEH_begin_bn_mul_mont_gather5_nohw
+	.rva	.LSEH_end_bn_mul_mont_gather5_nohw
+	.rva	.LSEH_info_bn_mul_mont_gather5_nohw
 
 	.rva	.LSEH_begin_bn_mul4x_mont_gather5
 	.rva	.LSEH_end_bn_mul4x_mont_gather5
 	.rva	.LSEH_info_bn_mul4x_mont_gather5
 
-	.rva	.LSEH_begin_bn_power5
-	.rva	.LSEH_end_bn_power5
-	.rva	.LSEH_info_bn_power5
+	.rva	.LSEH_begin_bn_power5_nohw
+	.rva	.LSEH_end_bn_power5_nohw
+	.rva	.LSEH_info_bn_power5_nohw
 ___
 $code.=<<___ if ($addx);
 	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
@@ -3733,7 +3720,7 @@
 
 .section	.xdata
 .align	8
-.LSEH_info_bn_mul_mont_gather5:
+.LSEH_info_bn_mul_mont_gather5_nohw:
 	.byte	9,0,0,0
 	.rva	mul_handler
 	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
@@ -3743,7 +3730,7 @@
 	.rva	mul_handler
 	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
 .align	8
-.LSEH_info_bn_power5:
+.LSEH_info_bn_power5_nohw:
 	.byte	9,0,0,0
 	.rva	mul_handler
 	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
diff --git a/crypto/fipsmodule/bn/bn_test.cc b/crypto/fipsmodule/bn/bn_test.cc
index fcc59e0..710b60f 100644
--- a/crypto/fipsmodule/bn/bn_test.cc
+++ b/crypto/fipsmodule/bn/bn_test.cc
@@ -2946,17 +2946,35 @@
     }
     CHECK_ABI(bn_gather5, r.data(), words, table.data(), 13);
 
-    CHECK_ABI(bn_mul_mont_gather5, r.data(), r.data(), table.data(), m->d,
+    if (bn_mulx4x_mont_gather5_capable(words)) {
+      CHECK_ABI(bn_mulx4x_mont_gather5, r.data(), r.data(), table.data(), m->d,
+                mont->n0, words, 13);
+      CHECK_ABI(bn_mulx4x_mont_gather5, r.data(), a.data(), table.data(), m->d,
+                mont->n0, words, 13);
+    }
+    if (bn_mul4x_mont_gather5_capable(words)) {
+      CHECK_ABI(bn_mul4x_mont_gather5, r.data(), r.data(), table.data(), m->d,
+                mont->n0, words, 13);
+      CHECK_ABI(bn_mul4x_mont_gather5, r.data(), a.data(), table.data(), m->d,
+                mont->n0, words, 13);
+    }
+    CHECK_ABI(bn_mul_mont_gather5_nohw, r.data(), r.data(), table.data(), m->d,
               mont->n0, words, 13);
-    CHECK_ABI(bn_mul_mont_gather5, r.data(), a.data(), table.data(), m->d,
+    CHECK_ABI(bn_mul_mont_gather5_nohw, r.data(), a.data(), table.data(), m->d,
               mont->n0, words, 13);
 
-    if (words % 8 == 0) {
-      CHECK_ABI(bn_power5, r.data(), r.data(), table.data(), m->d, mont->n0,
+    if (bn_powerx5_capable(words)) {
+      CHECK_ABI(bn_powerx5, r.data(), r.data(), table.data(), m->d, mont->n0,
                 words, 13);
-      CHECK_ABI(bn_power5, r.data(), a.data(), table.data(), m->d, mont->n0,
+      CHECK_ABI(bn_powerx5, r.data(), a.data(), table.data(), m->d, mont->n0,
                 words, 13);
     }
+    if (bn_power5_capable(words)) {
+      CHECK_ABI(bn_power5_nohw, r.data(), r.data(), table.data(), m->d,
+                mont->n0, words, 13);
+      CHECK_ABI(bn_power5_nohw, r.data(), a.data(), table.data(), m->d,
+                mont->n0, words, 13);
+    }
   }
 }
 #endif  // OPENSSL_BN_ASM_MONT5 && SUPPORTS_ABI_TEST
diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c
index 53c6142..9030aa8 100644
--- a/crypto/fipsmodule/bn/exponentiation.c
+++ b/crypto/fipsmodule/bn/exponentiation.c
@@ -119,6 +119,50 @@
 #include "internal.h"
 #include "rsaz_exp.h"
 
+#if defined(OPENSSL_BN_ASM_MONT5)
+
+// bn_mul_mont_gather5 loads index |power| of |table|, multiplies it
+// by |ap| modulo |np|, and stores the result in |rp|. The values are |num|
+// words long and represented in Montgomery form. |n0| is a pointer to the
+// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least
+// 16 bytes. |power| must be less than 32 and is treated as secret.
+//
+// WARNING: This function implements Almost Montgomery Multiplication from
+// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
+// However, even if they are fully reduced, the output may not be.
+static void bn_mul_mont_gather5(
+    BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, const BN_ULONG *np,
+    const BN_ULONG *n0, int num, int power) {
+  if (bn_mulx4x_mont_gather5_capable(num)) {
+    bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power);
+  } else if (bn_mul4x_mont_gather5_capable(num)) {
+    bn_mul4x_mont_gather5(rp, ap, table, np, n0, num, power);
+  } else {
+    bn_mul_mont_gather5_nohw(rp, ap, table, np, n0, num, power);
+  }
+}
+
+// bn_power5 squares |ap| five times and multiplies it by the value stored at
+// index |power| of |table|, modulo |np|. It stores the result in |rp|. The
+// values are |num| words long and represented in Montgomery form. |n0| is a
+// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible
+// by 8. |power| must be less than 32 and is treated as secret.
+//
+// WARNING: This function implements Almost Montgomery Multiplication from
+// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
+// However, even if they are fully reduced, the output may not be.
+static void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
+                      const BN_ULONG *np, const BN_ULONG *n0, int num,
+                      int power) {
+  assert(bn_power5_capable(num));
+  if (bn_powerx5_capable(num)) {
+    bn_powerx5(rp, ap, table, np, n0, num, power);
+  } else {
+    bn_power5_nohw(rp, ap, table, np, n0, num, power);
+  }
+}
+
+#endif // defined(OPENSSL_BN_ASM_MONT5)
 
 int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
   int i, bits, ret = 0;
@@ -1079,7 +1123,7 @@
 
     // Scan the exponent one window at a time starting from the most
     // significant bits.
-    if (top & 7) {
+    if (!bn_power5_capable(top)) {
       while (bits >= 0) {
         for (wvalue = 0, i = 0; i < 5; i++, bits--) {
           wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 0271160..679e249 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -438,18 +438,26 @@
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
 #define OPENSSL_BN_ASM_MONT5
 
-// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it
-// by |ap| modulo |np|, and stores the result in |rp|. The values are |num|
-// words long and represented in Montgomery form. |n0| is a pointer to the
-// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least
-// 16 bytes. |power| must be less than 32 and is treated as secret.
-//
-// WARNING: This function implements Almost Montgomery Multiplication from
-// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
-// However, even if they are fully reduced, the output may not be.
-void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
-                         const BN_ULONG *table, const BN_ULONG *np,
-                         const BN_ULONG *n0, int num, int power);
+// The following functions implement |bn_mul_mont_gather5|. See that function
+// in exponentiation.c for details.
+OPENSSL_INLINE int bn_mul4x_mont_gather5_capable(int num) {
+  return (num & 7) == 0;
+}
+void bn_mul4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
+                           const BN_ULONG *table, const BN_ULONG *np,
+                           const BN_ULONG *n0, int num, int power);
+
+OPENSSL_INLINE int bn_mulx4x_mont_gather5_capable(int num) {
+  return bn_mul4x_mont_gather5_capable(num) && CRYPTO_is_ADX_capable() &&
+         CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable();
+}
+void bn_mulx4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
+                            const BN_ULONG *table, const BN_ULONG *np,
+                            const BN_ULONG *n0, int num, int power);
+
+void bn_mul_mont_gather5_nohw(BN_ULONG *rp, const BN_ULONG *ap,
+                              const BN_ULONG *table, const BN_ULONG *np,
+                              const BN_ULONG *n0, int num, int power);
 
 // bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of
 // |table| are |num| words long. |power| must be less than 32 and is treated as
@@ -463,17 +471,19 @@
 // is treated as secret. |table| must be aligned to at least 16 bytes.
 void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power);
 
-// bn_power5 squares |ap| five times and multiplies it by the value stored at
-// index |power| of |table|, modulo |np|. It stores the result in |rp|. The
-// values are |num| words long and represented in Montgomery form. |n0| is a
-// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible
-// by 8. |power| must be less than 32 and is treated as secret.
-//
-// WARNING: This function implements Almost Montgomery Multiplication from
-// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
-// However, even if they are fully reduced, the output may not be.
-void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
-               const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
+// The following functions implement |bn_power5|. See exponentiation.c.
+void bn_power5_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
+                    const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
+
+OPENSSL_INLINE int bn_power5_capable(int num) { return (num & 7) == 0; }
+
+OPENSSL_INLINE int bn_powerx5_capable(int num) {
+  return bn_power5_capable(num) && CRYPTO_is_ADX_capable() &&
+         CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable();
+}
+void bn_powerx5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
+                const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
+
 #endif  // !OPENSSL_NO_ASM && OPENSSL_X86_64
 
 uint64_t bn_mont_n0(const BIGNUM *n);
diff --git a/gen/bcm/x86_64-mont5-apple.S b/gen/bcm/x86_64-mont5-apple.S
index bd63d91..5cf770f 100644
--- a/gen/bcm/x86_64-mont5-apple.S
+++ b/gen/bcm/x86_64-mont5-apple.S
@@ -6,26 +6,18 @@
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
 .text	
 
-
-
-.globl	_bn_mul_mont_gather5
-.private_extern _bn_mul_mont_gather5
+.globl	_bn_mul_mont_gather5_nohw
+.private_extern _bn_mul_mont_gather5_nohw
 
 .p2align	6
-_bn_mul_mont_gather5:
+_bn_mul_mont_gather5_nohw:
 
 _CET_ENDBR
+
+
 	movl	%r9d,%r9d
 	movq	%rsp,%rax
 
-	testl	$7,%r9d
-	jnz	L$mul_enter
-	leaq	_OPENSSL_ia32cap_P(%rip),%r11
-	movl	8(%r11),%r11d
-	jmp	L$mul4x_enter
-
-.p2align	4
-L$mul_enter:
 	movd	8(%rsp),%xmm5
 	pushq	%rbx
 
@@ -452,17 +444,16 @@
 	ret
 
 
+.globl	_bn_mul4x_mont_gather5
+.private_extern _bn_mul4x_mont_gather5
 
 .p2align	5
-bn_mul4x_mont_gather5:
+_bn_mul4x_mont_gather5:
 
+_CET_ENDBR
 .byte	0x67
 	movq	%rsp,%rax
 
-L$mul4x_enter:
-	andl	$0x80108,%r11d
-	cmpl	$0x80108,%r11d
-	je	L$mulx4x_enter
 	pushq	%rbx
 
 	pushq	%rbp
@@ -478,6 +469,9 @@
 L$mul4x_prologue:
 
 .byte	0x67
+
+
+
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
 	negq	%r9
@@ -1087,20 +1081,15 @@
 	jmp	L$sqr4x_sub_entry
 
 
-.globl	_bn_power5
-.private_extern _bn_power5
+.globl	_bn_power5_nohw
+.private_extern _bn_power5_nohw
 
 .p2align	5
-_bn_power5:
+_bn_power5_nohw:
 
 _CET_ENDBR
 	movq	%rsp,%rax
 
-	leaq	_OPENSSL_ia32cap_P(%rip),%r11
-	movl	8(%r11),%r11d
-	andl	$0x80108,%r11d
-	cmpl	$0x80108,%r11d
-	je	L$powerx5_enter
 	pushq	%rbx
 
 	pushq	%rbp
@@ -1115,6 +1104,9 @@
 
 L$power5_prologue:
 
+
+
+
 	shll	$3,%r9d
 	leal	(%r9,%r9,2),%r10d
 	negq	%r9
@@ -2066,13 +2058,15 @@
 	ret
 
 
+.globl	_bn_mulx4x_mont_gather5
+.private_extern _bn_mulx4x_mont_gather5
 
 .p2align	5
-bn_mulx4x_mont_gather5:
+_bn_mulx4x_mont_gather5:
 
+_CET_ENDBR
 	movq	%rsp,%rax
 
-L$mulx4x_enter:
 	pushq	%rbx
 
 	pushq	%rbp
@@ -2087,6 +2081,9 @@
 
 L$mulx4x_prologue:
 
+
+
+
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
 	negq	%r9
@@ -2603,13 +2600,15 @@
 	jmp	L$sqrx4x_sub_entry
 
 
+.globl	_bn_powerx5
+.private_extern _bn_powerx5
 
 .p2align	5
-bn_powerx5:
+_bn_powerx5:
 
+_CET_ENDBR
 	movq	%rsp,%rax
 
-L$powerx5_enter:
 	pushq	%rbx
 
 	pushq	%rbp
@@ -2624,6 +2623,9 @@
 
 L$powerx5_prologue:
 
+
+
+
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
 	negq	%r9
diff --git a/gen/bcm/x86_64-mont5-linux.S b/gen/bcm/x86_64-mont5-linux.S
index 14ab4f7..dcc02fc 100644
--- a/gen/bcm/x86_64-mont5-linux.S
+++ b/gen/bcm/x86_64-mont5-linux.S
@@ -6,27 +6,18 @@
 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
 .text	
 
-.extern	OPENSSL_ia32cap_P
-.hidden OPENSSL_ia32cap_P
-
-.globl	bn_mul_mont_gather5
-.hidden bn_mul_mont_gather5
-.type	bn_mul_mont_gather5,@function
+.globl	bn_mul_mont_gather5_nohw
+.hidden bn_mul_mont_gather5_nohw
+.type	bn_mul_mont_gather5_nohw,@function
 .align	64
-bn_mul_mont_gather5:
+bn_mul_mont_gather5_nohw:
 .cfi_startproc	
 _CET_ENDBR
+
+
 	movl	%r9d,%r9d
 	movq	%rsp,%rax
 .cfi_def_cfa_register	%rax
-	testl	$7,%r9d
-	jnz	.Lmul_enter
-	leaq	OPENSSL_ia32cap_P(%rip),%r11
-	movl	8(%r11),%r11d
-	jmp	.Lmul4x_enter
-
-.align	16
-.Lmul_enter:
 	movd	8(%rsp),%xmm5
 	pushq	%rbx
 .cfi_offset	%rbx,-16
@@ -452,18 +443,17 @@
 .Lmul_epilogue:
 	ret
 .cfi_endproc	
-.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.size	bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw
+.globl	bn_mul4x_mont_gather5
+.hidden bn_mul4x_mont_gather5
 .type	bn_mul4x_mont_gather5,@function
 .align	32
 bn_mul4x_mont_gather5:
 .cfi_startproc	
+_CET_ENDBR
 .byte	0x67
 	movq	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lmul4x_enter:
-	andl	$0x80108,%r11d
-	cmpl	$0x80108,%r11d
-	je	.Lmulx4x_enter
 	pushq	%rbx
 .cfi_offset	%rbx,-16
 	pushq	%rbp
@@ -479,6 +469,9 @@
 .Lmul4x_prologue:
 
 .byte	0x67
+
+
+
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
 	negq	%r9
@@ -1088,20 +1081,15 @@
 	jmp	.Lsqr4x_sub_entry
 .cfi_endproc	
 .size	mul4x_internal,.-mul4x_internal
-.globl	bn_power5
-.hidden bn_power5
-.type	bn_power5,@function
+.globl	bn_power5_nohw
+.hidden bn_power5_nohw
+.type	bn_power5_nohw,@function
 .align	32
-bn_power5:
+bn_power5_nohw:
 .cfi_startproc	
 _CET_ENDBR
 	movq	%rsp,%rax
 .cfi_def_cfa_register	%rax
-	leaq	OPENSSL_ia32cap_P(%rip),%r11
-	movl	8(%r11),%r11d
-	andl	$0x80108,%r11d
-	cmpl	$0x80108,%r11d
-	je	.Lpowerx5_enter
 	pushq	%rbx
 .cfi_offset	%rbx,-16
 	pushq	%rbp
@@ -1116,6 +1104,9 @@
 .cfi_offset	%r15,-56
 .Lpower5_prologue:
 
+
+
+
 	shll	$3,%r9d
 	leal	(%r9,%r9,2),%r10d
 	negq	%r9
@@ -1224,7 +1215,7 @@
 .Lpower5_epilogue:
 	ret
 .cfi_endproc	
-.size	bn_power5,.-bn_power5
+.size	bn_power5_nohw,.-bn_power5_nohw
 
 .globl	bn_sqr8x_internal
 .hidden bn_sqr8x_internal
@@ -2067,13 +2058,15 @@
 	ret
 .cfi_endproc	
 .size	__bn_post4x_internal,.-__bn_post4x_internal
+.globl	bn_mulx4x_mont_gather5
+.hidden bn_mulx4x_mont_gather5
 .type	bn_mulx4x_mont_gather5,@function
 .align	32
 bn_mulx4x_mont_gather5:
 .cfi_startproc	
+_CET_ENDBR
 	movq	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lmulx4x_enter:
 	pushq	%rbx
 .cfi_offset	%rbx,-16
 	pushq	%rbp
@@ -2088,6 +2081,9 @@
 .cfi_offset	%r15,-56
 .Lmulx4x_prologue:
 
+
+
+
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
 	negq	%r9
@@ -2604,13 +2600,15 @@
 	jmp	.Lsqrx4x_sub_entry
 .cfi_endproc	
 .size	mulx4x_internal,.-mulx4x_internal
+.globl	bn_powerx5
+.hidden bn_powerx5
 .type	bn_powerx5,@function
 .align	32
 bn_powerx5:
 .cfi_startproc	
+_CET_ENDBR
 	movq	%rsp,%rax
 .cfi_def_cfa_register	%rax
-.Lpowerx5_enter:
 	pushq	%rbx
 .cfi_offset	%rbx,-16
 	pushq	%rbp
@@ -2625,6 +2623,9 @@
 .cfi_offset	%r15,-56
 .Lpowerx5_prologue:
 
+
+
+
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
 	negq	%r9
diff --git a/gen/bcm/x86_64-mont5-win.asm b/gen/bcm/x86_64-mont5-win.asm
index 46aae51..3b12405 100644
--- a/gen/bcm/x86_64-mont5-win.asm
+++ b/gen/bcm/x86_64-mont5-win.asm
@@ -14,16 +14,14 @@
 section	.text code align=64
 
 
-EXTERN	OPENSSL_ia32cap_P
-
-global	bn_mul_mont_gather5
+global	bn_mul_mont_gather5_nohw
 
 ALIGN	64
-bn_mul_mont_gather5:
+bn_mul_mont_gather5_nohw:
 	mov	QWORD[8+rsp],rdi	;WIN64 prologue
 	mov	QWORD[16+rsp],rsi
 	mov	rax,rsp
-$L$SEH_begin_bn_mul_mont_gather5:
+$L$SEH_begin_bn_mul_mont_gather5_nohw:
 	mov	rdi,rcx
 	mov	rsi,rdx
 	mov	rdx,r8
@@ -34,17 +32,11 @@
 
 
 _CET_ENDBR
+
+
 	mov	r9d,r9d
 	mov	rax,rsp
 
-	test	r9d,7
-	jnz	NEAR $L$mul_enter
-	lea	r11,[OPENSSL_ia32cap_P]
-	mov	r11d,DWORD[8+r11]
-	jmp	NEAR $L$mul4x_enter
-
-ALIGN	16
-$L$mul_enter:
 	movd	xmm5,DWORD[56+rsp]
 	push	rbx
 
@@ -472,7 +464,8 @@
 	mov	rsi,QWORD[16+rsp]
 	ret
 
-$L$SEH_end_bn_mul_mont_gather5:
+$L$SEH_end_bn_mul_mont_gather5_nohw:
+global	bn_mul4x_mont_gather5
 
 ALIGN	32
 bn_mul4x_mont_gather5:
@@ -489,13 +482,10 @@
 
 
 
+_CET_ENDBR
 	DB	0x67
 	mov	rax,rsp
 
-$L$mul4x_enter:
-	and	r11d,0x80108
-	cmp	r11d,0x80108
-	je	NEAR $L$mulx4x_enter
 	push	rbx
 
 	push	rbp
@@ -511,6 +501,9 @@
 $L$mul4x_prologue:
 
 	DB	0x67
+
+
+
 	shl	r9d,3
 	lea	r10,[r9*2+r9]
 	neg	r9
@@ -1122,14 +1115,14 @@
 	jmp	NEAR $L$sqr4x_sub_entry
 
 
-global	bn_power5
+global	bn_power5_nohw
 
 ALIGN	32
-bn_power5:
+bn_power5_nohw:
 	mov	QWORD[8+rsp],rdi	;WIN64 prologue
 	mov	QWORD[16+rsp],rsi
 	mov	rax,rsp
-$L$SEH_begin_bn_power5:
+$L$SEH_begin_bn_power5_nohw:
 	mov	rdi,rcx
 	mov	rsi,rdx
 	mov	rdx,r8
@@ -1142,11 +1135,6 @@
 _CET_ENDBR
 	mov	rax,rsp
 
-	lea	r11,[OPENSSL_ia32cap_P]
-	mov	r11d,DWORD[8+r11]
-	and	r11d,0x80108
-	cmp	r11d,0x80108
-	je	NEAR $L$powerx5_enter
 	push	rbx
 
 	push	rbp
@@ -1161,6 +1149,9 @@
 
 $L$power5_prologue:
 
+
+
+
 	shl	r9d,3
 	lea	r10d,[r9*2+r9]
 	neg	r9
@@ -1271,7 +1262,7 @@
 	mov	rsi,QWORD[16+rsp]
 	ret
 
-$L$SEH_end_bn_power5:
+$L$SEH_end_bn_power5_nohw:
 
 global	bn_sqr8x_internal
 
@@ -2113,6 +2104,7 @@
 	ret
 
 
+global	bn_mulx4x_mont_gather5
 
 ALIGN	32
 bn_mulx4x_mont_gather5:
@@ -2129,9 +2121,9 @@
 
 
 
+_CET_ENDBR
 	mov	rax,rsp
 
-$L$mulx4x_enter:
 	push	rbx
 
 	push	rbp
@@ -2146,6 +2138,9 @@
 
 $L$mulx4x_prologue:
 
+
+
+
 	shl	r9d,3
 	lea	r10,[r9*2+r9]
 	neg	r9
@@ -2664,6 +2659,7 @@
 	jmp	NEAR $L$sqrx4x_sub_entry
 
 
+global	bn_powerx5
 
 ALIGN	32
 bn_powerx5:
@@ -2680,9 +2676,9 @@
 
 
 
+_CET_ENDBR
 	mov	rax,rsp
 
-$L$powerx5_enter:
 	push	rbx
 
 	push	rbp
@@ -2697,6 +2693,9 @@
 
 $L$powerx5_prologue:
 
+
+
+
 	shl	r9d,3
 	lea	r10,[r9*2+r9]
 	neg	r9
@@ -3804,17 +3803,17 @@
 
 section	.pdata rdata align=4
 ALIGN	4
-	DD	$L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase
-	DD	$L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase
-	DD	$L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
+	DD	$L$SEH_begin_bn_mul_mont_gather5_nohw wrt ..imagebase
+	DD	$L$SEH_end_bn_mul_mont_gather5_nohw wrt ..imagebase
+	DD	$L$SEH_info_bn_mul_mont_gather5_nohw wrt ..imagebase
 
 	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
 	DD	$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
 	DD	$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
 
-	DD	$L$SEH_begin_bn_power5 wrt ..imagebase
-	DD	$L$SEH_end_bn_power5 wrt ..imagebase
-	DD	$L$SEH_info_bn_power5 wrt ..imagebase
+	DD	$L$SEH_begin_bn_power5_nohw wrt ..imagebase
+	DD	$L$SEH_end_bn_power5_nohw wrt ..imagebase
+	DD	$L$SEH_info_bn_power5_nohw wrt ..imagebase
 	DD	$L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
 	DD	$L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
 	DD	$L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
@@ -3828,7 +3827,7 @@
 
 section	.xdata rdata align=8
 ALIGN	8
-$L$SEH_info_bn_mul_mont_gather5:
+$L$SEH_info_bn_mul_mont_gather5_nohw:
 	DB	9,0,0,0
 	DD	mul_handler wrt ..imagebase
 	DD	$L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
@@ -3838,7 +3837,7 @@
 	DD	mul_handler wrt ..imagebase
 	DD	$L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
 ALIGN	8
-$L$SEH_info_bn_power5:
+$L$SEH_info_bn_power5_nohw:
 	DB	9,0,0,0
 	DD	mul_handler wrt ..imagebase
 	DD	$L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase