bn: Move dispatching logic from x86_64-mont5.pl to C.
CL originally uploaded by Brian Smith at
https://boringssl-review.googlesource.com/c/boringssl/+/65569
Bug: 673
Change-Id: If84d34cae1c44cc883fc292dd048542e2b341f41
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/68347
Reviewed-by: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
index 88d98af..a944739 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
@@ -50,7 +50,7 @@
# output, so this isn't useful anyway.
$addx = 1;
-# int bn_mul_mont_gather5(
+# int bn_mul_mont_gather5_nohw(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
$bp="%rdx"; # const BN_ULONG *bp,
@@ -72,29 +72,17 @@
$code=<<___;
.text
-.extern OPENSSL_ia32cap_P
-
-.globl bn_mul_mont_gather5
-.type bn_mul_mont_gather5,\@function,6
+.globl bn_mul_mont_gather5_nohw
+.type bn_mul_mont_gather5_nohw,\@function,6
.align 64
-bn_mul_mont_gather5:
+bn_mul_mont_gather5_nohw:
.cfi_startproc
_CET_ENDBR
+ # num is declared as an int, a 32-bit parameter, so the upper half is
+ # undefined. Zero the upper half to normalize it.
mov ${num}d,${num}d
mov %rsp,%rax
.cfi_def_cfa_register %rax
- test \$7,${num}d
- jnz .Lmul_enter
-___
-$code.=<<___ if ($addx);
- leaq OPENSSL_ia32cap_P(%rip),%r11
- mov 8(%r11),%r11d
-___
-$code.=<<___;
- jmp .Lmul4x_enter
-
-.align 16
-.Lmul_enter:
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
push %rbx
.cfi_push %rbx
@@ -454,27 +442,21 @@
.Lmul_epilogue:
ret
.cfi_endproc
-.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.size bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
+.globl bn_mul4x_mont_gather5
.type bn_mul4x_mont_gather5,\@function,6
.align 32
bn_mul4x_mont_gather5:
.cfi_startproc
+ _CET_ENDBR
.byte 0x67
mov %rsp,%rax
.cfi_def_cfa_register %rax
-.Lmul4x_enter:
-___
-$code.=<<___ if ($addx);
- and \$0x80108,%r11d
- cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
- je .Lmulx4x_enter
-___
-$code.=<<___;
push %rbx
.cfi_push %rbx
push %rbp
@@ -490,6 +472,9 @@
.Lmul4x_prologue:
.byte 0x67
+ # num is declared as an int, a 32-bit parameter, so the upper half is
+ # undefined. It is important that this write to ${num}, which zeros the
+ # upper half, come before the first access.
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
@@ -1079,7 +1064,7 @@
}}}
{{{
######################################################################
-# void bn_power5(
+# void bn_power5_nohw(
my $rptr="%rdi"; # BN_ULONG *rptr,
my $aptr="%rsi"; # const BN_ULONG *aptr,
my $bptr="%rdx"; # const BN_ULONG *table,
@@ -1094,23 +1079,14 @@
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
$code.=<<___;
-.globl bn_power5
-.type bn_power5,\@function,6
+.globl bn_power5_nohw
+.type bn_power5_nohw,\@function,6
.align 32
-bn_power5:
+bn_power5_nohw:
.cfi_startproc
_CET_ENDBR
mov %rsp,%rax
.cfi_def_cfa_register %rax
-___
-$code.=<<___ if ($addx);
- leaq OPENSSL_ia32cap_P(%rip),%r11
- mov 8(%r11),%r11d
- and \$0x80108,%r11d
- cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
- je .Lpowerx5_enter
-___
-$code.=<<___;
push %rbx
.cfi_push %rbx
push %rbp
@@ -1125,6 +1101,9 @@
.cfi_push %r15
.Lpower5_prologue:
+ # num is declared as an int, a 32-bit parameter, so the upper half is
+ # undefined. It is important that this write to ${num}, which zeros the
+ # upper half, come before the first access.
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10d # 3*$num
neg $num
@@ -1233,7 +1212,7 @@
.Lpower5_epilogue:
ret
.cfi_endproc
-.size bn_power5,.-bn_power5
+.size bn_power5_nohw,.-bn_power5_nohw
.globl bn_sqr8x_internal
.hidden bn_sqr8x_internal
@@ -2108,13 +2087,14 @@
my $bp="%rdx"; # restore original value
$code.=<<___;
+.globl bn_mulx4x_mont_gather5
.type bn_mulx4x_mont_gather5,\@function,6
.align 32
bn_mulx4x_mont_gather5:
.cfi_startproc
+ _CET_ENDBR
mov %rsp,%rax
.cfi_def_cfa_register %rax
-.Lmulx4x_enter:
push %rbx
.cfi_push %rbx
push %rbp
@@ -2129,6 +2109,9 @@
.cfi_push %r15
.Lmulx4x_prologue:
+ # num is declared as an int, a 32-bit parameter, so the upper half is
+ # undefined. It is important that this write to ${num}, which zeros the
+ # upper half, come before the first access.
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
@@ -2583,7 +2566,7 @@
___
}{
######################################################################
-# void bn_power5(
+# void bn_powerx5(
my $rptr="%rdi"; # BN_ULONG *rptr,
my $aptr="%rsi"; # const BN_ULONG *aptr,
my $bptr="%rdx"; # const BN_ULONG *table,
@@ -2598,13 +2581,14 @@
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
$code.=<<___;
+.globl bn_powerx5
.type bn_powerx5,\@function,6
.align 32
bn_powerx5:
.cfi_startproc
+ _CET_ENDBR
mov %rsp,%rax
.cfi_def_cfa_register %rax
-.Lpowerx5_enter:
push %rbx
.cfi_push %rbx
push %rbp
@@ -2619,6 +2603,9 @@
.cfi_push %r15
.Lpowerx5_prologue:
+ # num is declared as an int, a 32-bit parameter, so the upper half is
+ # undefined. It is important that this write to ${num}, which zeros the
+ # upper half, come before the first access.
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
@@ -3705,17 +3692,17 @@
.section .pdata
.align 4
- .rva .LSEH_begin_bn_mul_mont_gather5
- .rva .LSEH_end_bn_mul_mont_gather5
- .rva .LSEH_info_bn_mul_mont_gather5
+ .rva .LSEH_begin_bn_mul_mont_gather5_nohw
+ .rva .LSEH_end_bn_mul_mont_gather5_nohw
+ .rva .LSEH_info_bn_mul_mont_gather5_nohw
.rva .LSEH_begin_bn_mul4x_mont_gather5
.rva .LSEH_end_bn_mul4x_mont_gather5
.rva .LSEH_info_bn_mul4x_mont_gather5
- .rva .LSEH_begin_bn_power5
- .rva .LSEH_end_bn_power5
- .rva .LSEH_info_bn_power5
+ .rva .LSEH_begin_bn_power5_nohw
+ .rva .LSEH_end_bn_power5_nohw
+ .rva .LSEH_info_bn_power5_nohw
___
$code.=<<___ if ($addx);
.rva .LSEH_begin_bn_mulx4x_mont_gather5
@@ -3733,7 +3720,7 @@
.section .xdata
.align 8
-.LSEH_info_bn_mul_mont_gather5:
+.LSEH_info_bn_mul_mont_gather5_nohw:
.byte 9,0,0,0
.rva mul_handler
.rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
@@ -3743,7 +3730,7 @@
.rva mul_handler
.rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.align 8
-.LSEH_info_bn_power5:
+.LSEH_info_bn_power5_nohw:
.byte 9,0,0,0
.rva mul_handler
.rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
diff --git a/crypto/fipsmodule/bn/bn_test.cc b/crypto/fipsmodule/bn/bn_test.cc
index fcc59e0..710b60f 100644
--- a/crypto/fipsmodule/bn/bn_test.cc
+++ b/crypto/fipsmodule/bn/bn_test.cc
@@ -2946,17 +2946,35 @@
}
CHECK_ABI(bn_gather5, r.data(), words, table.data(), 13);
- CHECK_ABI(bn_mul_mont_gather5, r.data(), r.data(), table.data(), m->d,
+ if (bn_mulx4x_mont_gather5_capable(words)) {
+ CHECK_ABI(bn_mulx4x_mont_gather5, r.data(), r.data(), table.data(), m->d,
+ mont->n0, words, 13);
+ CHECK_ABI(bn_mulx4x_mont_gather5, r.data(), a.data(), table.data(), m->d,
+ mont->n0, words, 13);
+ }
+ if (bn_mul4x_mont_gather5_capable(words)) {
+ CHECK_ABI(bn_mul4x_mont_gather5, r.data(), r.data(), table.data(), m->d,
+ mont->n0, words, 13);
+ CHECK_ABI(bn_mul4x_mont_gather5, r.data(), a.data(), table.data(), m->d,
+ mont->n0, words, 13);
+ }
+ CHECK_ABI(bn_mul_mont_gather5_nohw, r.data(), r.data(), table.data(), m->d,
mont->n0, words, 13);
- CHECK_ABI(bn_mul_mont_gather5, r.data(), a.data(), table.data(), m->d,
+ CHECK_ABI(bn_mul_mont_gather5_nohw, r.data(), a.data(), table.data(), m->d,
mont->n0, words, 13);
- if (words % 8 == 0) {
- CHECK_ABI(bn_power5, r.data(), r.data(), table.data(), m->d, mont->n0,
+ if (bn_powerx5_capable(words)) {
+ CHECK_ABI(bn_powerx5, r.data(), r.data(), table.data(), m->d, mont->n0,
words, 13);
- CHECK_ABI(bn_power5, r.data(), a.data(), table.data(), m->d, mont->n0,
+ CHECK_ABI(bn_powerx5, r.data(), a.data(), table.data(), m->d, mont->n0,
words, 13);
}
+ if (bn_power5_capable(words)) {
+ CHECK_ABI(bn_power5_nohw, r.data(), r.data(), table.data(), m->d,
+ mont->n0, words, 13);
+ CHECK_ABI(bn_power5_nohw, r.data(), a.data(), table.data(), m->d,
+ mont->n0, words, 13);
+ }
}
}
#endif // OPENSSL_BN_ASM_MONT5 && SUPPORTS_ABI_TEST
diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c
index 53c6142..9030aa8 100644
--- a/crypto/fipsmodule/bn/exponentiation.c
+++ b/crypto/fipsmodule/bn/exponentiation.c
@@ -119,6 +119,50 @@
#include "internal.h"
#include "rsaz_exp.h"
+#if defined(OPENSSL_BN_ASM_MONT5)
+
+// bn_mul_mont_gather5 loads index |power| of |table|, multiplies it
+// by |ap| modulo |np|, and stores the result in |rp|. The values are |num|
+// words long and represented in Montgomery form. |n0| is a pointer to the
+// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least
+// 16 bytes. |power| must be less than 32 and is treated as secret.
+//
+// WARNING: This function implements Almost Montgomery Multiplication from
+// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
+// However, even if they are fully reduced, the output may not be.
+static void bn_mul_mont_gather5(
+ BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power) {
+ if (bn_mulx4x_mont_gather5_capable(num)) {
+ bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power);
+ } else if (bn_mul4x_mont_gather5_capable(num)) {
+ bn_mul4x_mont_gather5(rp, ap, table, np, n0, num, power);
+ } else {
+ bn_mul_mont_gather5_nohw(rp, ap, table, np, n0, num, power);
+ }
+}
+
+// bn_power5 squares |ap| five times and multiplies it by the value stored at
+// index |power| of |table|, modulo |np|. It stores the result in |rp|. The
+// values are |num| words long and represented in Montgomery form. |n0| is a
+// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible
+// by 8. |power| must be less than 32 and is treated as secret.
+//
+// WARNING: This function implements Almost Montgomery Multiplication from
+// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
+// However, even if they are fully reduced, the output may not be.
+static void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
+ const BN_ULONG *np, const BN_ULONG *n0, int num,
+ int power) {
+ assert(bn_power5_capable(num));
+ if (bn_powerx5_capable(num)) {
+ bn_powerx5(rp, ap, table, np, n0, num, power);
+ } else {
+ bn_power5_nohw(rp, ap, table, np, n0, num, power);
+ }
+}
+
+#endif // defined(OPENSSL_BN_ASM_MONT5)
int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
int i, bits, ret = 0;
@@ -1079,7 +1123,7 @@
// Scan the exponent one window at a time starting from the most
// significant bits.
- if (top & 7) {
+ if (!bn_power5_capable(top)) {
while (bits >= 0) {
for (wvalue = 0, i = 0; i < 5; i++, bits--) {
wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 0271160..679e249 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -438,18 +438,26 @@
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
#define OPENSSL_BN_ASM_MONT5
-// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it
-// by |ap| modulo |np|, and stores the result in |rp|. The values are |num|
-// words long and represented in Montgomery form. |n0| is a pointer to the
-// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least
-// 16 bytes. |power| must be less than 32 and is treated as secret.
-//
-// WARNING: This function implements Almost Montgomery Multiplication from
-// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
-// However, even if they are fully reduced, the output may not be.
-void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
- const BN_ULONG *table, const BN_ULONG *np,
- const BN_ULONG *n0, int num, int power);
+// The following functions implement |bn_mul_mont_gather5|. See
+// |bn_mul_mont_gather5| for details.
+OPENSSL_INLINE int bn_mul4x_mont_gather5_capable(int num) {
+ return (num & 7) == 0;
+}
+void bn_mul4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+
+OPENSSL_INLINE int bn_mulx4x_mont_gather5_capable(int num) {
+ return bn_mul4x_mont_gather5_capable(num) && CRYPTO_is_ADX_capable() &&
+ CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable();
+}
+void bn_mulx4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
+
+void bn_mul_mont_gather5_nohw(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *table, const BN_ULONG *np,
+ const BN_ULONG *n0, int num, int power);
// bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of
// |table| are |num| words long. |power| must be less than 32 and is treated as
@@ -463,17 +471,19 @@
// is treated as secret. |table| must be aligned to at least 16 bytes.
void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power);
-// bn_power5 squares |ap| five times and multiplies it by the value stored at
-// index |power| of |table|, modulo |np|. It stores the result in |rp|. The
-// values are |num| words long and represented in Montgomery form. |n0| is a
-// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible
-// by 8. |power| must be less than 32 and is treated as secret.
-//
-// WARNING: This function implements Almost Montgomery Multiplication from
-// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
-// However, even if they are fully reduced, the output may not be.
-void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
- const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
+// The following functions implement |bn_power5|. See |bn_power5| for details.
+void bn_power5_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
+ const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
+
+OPENSSL_INLINE int bn_power5_capable(int num) { return (num & 7) == 0; }
+
+OPENSSL_INLINE int bn_powerx5_capable(int num) {
+ return bn_power5_capable(num) && CRYPTO_is_ADX_capable() &&
+ CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable();
+}
+void bn_powerx5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
+ const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
+
#endif // !OPENSSL_NO_ASM && OPENSSL_X86_64
uint64_t bn_mont_n0(const BIGNUM *n);
diff --git a/gen/bcm/x86_64-mont5-apple.S b/gen/bcm/x86_64-mont5-apple.S
index bd63d91..5cf770f 100644
--- a/gen/bcm/x86_64-mont5-apple.S
+++ b/gen/bcm/x86_64-mont5-apple.S
@@ -6,26 +6,18 @@
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__)
.text
-
-
-.globl _bn_mul_mont_gather5
-.private_extern _bn_mul_mont_gather5
+.globl _bn_mul_mont_gather5_nohw
+.private_extern _bn_mul_mont_gather5_nohw
.p2align 6
-_bn_mul_mont_gather5:
+_bn_mul_mont_gather5_nohw:
_CET_ENDBR
+
+
movl %r9d,%r9d
movq %rsp,%rax
- testl $7,%r9d
- jnz L$mul_enter
- leaq _OPENSSL_ia32cap_P(%rip),%r11
- movl 8(%r11),%r11d
- jmp L$mul4x_enter
-
-.p2align 4
-L$mul_enter:
movd 8(%rsp),%xmm5
pushq %rbx
@@ -452,17 +444,16 @@
ret
+.globl _bn_mul4x_mont_gather5
+.private_extern _bn_mul4x_mont_gather5
.p2align 5
-bn_mul4x_mont_gather5:
+_bn_mul4x_mont_gather5:
+_CET_ENDBR
.byte 0x67
movq %rsp,%rax
-L$mul4x_enter:
- andl $0x80108,%r11d
- cmpl $0x80108,%r11d
- je L$mulx4x_enter
pushq %rbx
pushq %rbp
@@ -478,6 +469,9 @@
L$mul4x_prologue:
.byte 0x67
+
+
+
shll $3,%r9d
leaq (%r9,%r9,2),%r10
negq %r9
@@ -1087,20 +1081,15 @@
jmp L$sqr4x_sub_entry
-.globl _bn_power5
-.private_extern _bn_power5
+.globl _bn_power5_nohw
+.private_extern _bn_power5_nohw
.p2align 5
-_bn_power5:
+_bn_power5_nohw:
_CET_ENDBR
movq %rsp,%rax
- leaq _OPENSSL_ia32cap_P(%rip),%r11
- movl 8(%r11),%r11d
- andl $0x80108,%r11d
- cmpl $0x80108,%r11d
- je L$powerx5_enter
pushq %rbx
pushq %rbp
@@ -1115,6 +1104,9 @@
L$power5_prologue:
+
+
+
shll $3,%r9d
leal (%r9,%r9,2),%r10d
negq %r9
@@ -2066,13 +2058,15 @@
ret
+.globl _bn_mulx4x_mont_gather5
+.private_extern _bn_mulx4x_mont_gather5
.p2align 5
-bn_mulx4x_mont_gather5:
+_bn_mulx4x_mont_gather5:
+_CET_ENDBR
movq %rsp,%rax
-L$mulx4x_enter:
pushq %rbx
pushq %rbp
@@ -2087,6 +2081,9 @@
L$mulx4x_prologue:
+
+
+
shll $3,%r9d
leaq (%r9,%r9,2),%r10
negq %r9
@@ -2603,13 +2600,15 @@
jmp L$sqrx4x_sub_entry
+.globl _bn_powerx5
+.private_extern _bn_powerx5
.p2align 5
-bn_powerx5:
+_bn_powerx5:
+_CET_ENDBR
movq %rsp,%rax
-L$powerx5_enter:
pushq %rbx
pushq %rbp
@@ -2624,6 +2623,9 @@
L$powerx5_prologue:
+
+
+
shll $3,%r9d
leaq (%r9,%r9,2),%r10
negq %r9
diff --git a/gen/bcm/x86_64-mont5-linux.S b/gen/bcm/x86_64-mont5-linux.S
index 14ab4f7..dcc02fc 100644
--- a/gen/bcm/x86_64-mont5-linux.S
+++ b/gen/bcm/x86_64-mont5-linux.S
@@ -6,27 +6,18 @@
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text
-.extern OPENSSL_ia32cap_P
-.hidden OPENSSL_ia32cap_P
-
-.globl bn_mul_mont_gather5
-.hidden bn_mul_mont_gather5
-.type bn_mul_mont_gather5,@function
+.globl bn_mul_mont_gather5_nohw
+.hidden bn_mul_mont_gather5_nohw
+.type bn_mul_mont_gather5_nohw,@function
.align 64
-bn_mul_mont_gather5:
+bn_mul_mont_gather5_nohw:
.cfi_startproc
_CET_ENDBR
+
+
movl %r9d,%r9d
movq %rsp,%rax
.cfi_def_cfa_register %rax
- testl $7,%r9d
- jnz .Lmul_enter
- leaq OPENSSL_ia32cap_P(%rip),%r11
- movl 8(%r11),%r11d
- jmp .Lmul4x_enter
-
-.align 16
-.Lmul_enter:
movd 8(%rsp),%xmm5
pushq %rbx
.cfi_offset %rbx,-16
@@ -452,18 +443,17 @@
.Lmul_epilogue:
ret
.cfi_endproc
-.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.size bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw
+.globl bn_mul4x_mont_gather5
+.hidden bn_mul4x_mont_gather5
.type bn_mul4x_mont_gather5,@function
.align 32
bn_mul4x_mont_gather5:
.cfi_startproc
+_CET_ENDBR
.byte 0x67
movq %rsp,%rax
.cfi_def_cfa_register %rax
-.Lmul4x_enter:
- andl $0x80108,%r11d
- cmpl $0x80108,%r11d
- je .Lmulx4x_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -479,6 +469,9 @@
.Lmul4x_prologue:
.byte 0x67
+
+
+
shll $3,%r9d
leaq (%r9,%r9,2),%r10
negq %r9
@@ -1088,20 +1081,15 @@
jmp .Lsqr4x_sub_entry
.cfi_endproc
.size mul4x_internal,.-mul4x_internal
-.globl bn_power5
-.hidden bn_power5
-.type bn_power5,@function
+.globl bn_power5_nohw
+.hidden bn_power5_nohw
+.type bn_power5_nohw,@function
.align 32
-bn_power5:
+bn_power5_nohw:
.cfi_startproc
_CET_ENDBR
movq %rsp,%rax
.cfi_def_cfa_register %rax
- leaq OPENSSL_ia32cap_P(%rip),%r11
- movl 8(%r11),%r11d
- andl $0x80108,%r11d
- cmpl $0x80108,%r11d
- je .Lpowerx5_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -1116,6 +1104,9 @@
.cfi_offset %r15,-56
.Lpower5_prologue:
+
+
+
shll $3,%r9d
leal (%r9,%r9,2),%r10d
negq %r9
@@ -1224,7 +1215,7 @@
.Lpower5_epilogue:
ret
.cfi_endproc
-.size bn_power5,.-bn_power5
+.size bn_power5_nohw,.-bn_power5_nohw
.globl bn_sqr8x_internal
.hidden bn_sqr8x_internal
@@ -2067,13 +2058,15 @@
ret
.cfi_endproc
.size __bn_post4x_internal,.-__bn_post4x_internal
+.globl bn_mulx4x_mont_gather5
+.hidden bn_mulx4x_mont_gather5
.type bn_mulx4x_mont_gather5,@function
.align 32
bn_mulx4x_mont_gather5:
.cfi_startproc
+_CET_ENDBR
movq %rsp,%rax
.cfi_def_cfa_register %rax
-.Lmulx4x_enter:
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -2088,6 +2081,9 @@
.cfi_offset %r15,-56
.Lmulx4x_prologue:
+
+
+
shll $3,%r9d
leaq (%r9,%r9,2),%r10
negq %r9
@@ -2604,13 +2600,15 @@
jmp .Lsqrx4x_sub_entry
.cfi_endproc
.size mulx4x_internal,.-mulx4x_internal
+.globl bn_powerx5
+.hidden bn_powerx5
.type bn_powerx5,@function
.align 32
bn_powerx5:
.cfi_startproc
+_CET_ENDBR
movq %rsp,%rax
.cfi_def_cfa_register %rax
-.Lpowerx5_enter:
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -2625,6 +2623,9 @@
.cfi_offset %r15,-56
.Lpowerx5_prologue:
+
+
+
shll $3,%r9d
leaq (%r9,%r9,2),%r10
negq %r9
diff --git a/gen/bcm/x86_64-mont5-win.asm b/gen/bcm/x86_64-mont5-win.asm
index 46aae51..3b12405 100644
--- a/gen/bcm/x86_64-mont5-win.asm
+++ b/gen/bcm/x86_64-mont5-win.asm
@@ -14,16 +14,14 @@
section .text code align=64
-EXTERN OPENSSL_ia32cap_P
-
-global bn_mul_mont_gather5
+global bn_mul_mont_gather5_nohw
ALIGN 64
-bn_mul_mont_gather5:
+bn_mul_mont_gather5_nohw:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
-$L$SEH_begin_bn_mul_mont_gather5:
+$L$SEH_begin_bn_mul_mont_gather5_nohw:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
@@ -34,17 +32,11 @@
_CET_ENDBR
+
+
mov r9d,r9d
mov rax,rsp
- test r9d,7
- jnz NEAR $L$mul_enter
- lea r11,[OPENSSL_ia32cap_P]
- mov r11d,DWORD[8+r11]
- jmp NEAR $L$mul4x_enter
-
-ALIGN 16
-$L$mul_enter:
movd xmm5,DWORD[56+rsp]
push rbx
@@ -472,7 +464,8 @@
mov rsi,QWORD[16+rsp]
ret
-$L$SEH_end_bn_mul_mont_gather5:
+$L$SEH_end_bn_mul_mont_gather5_nohw:
+global bn_mul4x_mont_gather5
ALIGN 32
bn_mul4x_mont_gather5:
@@ -489,13 +482,10 @@
+_CET_ENDBR
DB 0x67
mov rax,rsp
-$L$mul4x_enter:
- and r11d,0x80108
- cmp r11d,0x80108
- je NEAR $L$mulx4x_enter
push rbx
push rbp
@@ -511,6 +501,9 @@
$L$mul4x_prologue:
DB 0x67
+
+
+
shl r9d,3
lea r10,[r9*2+r9]
neg r9
@@ -1122,14 +1115,14 @@
jmp NEAR $L$sqr4x_sub_entry
-global bn_power5
+global bn_power5_nohw
ALIGN 32
-bn_power5:
+bn_power5_nohw:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
mov rax,rsp
-$L$SEH_begin_bn_power5:
+$L$SEH_begin_bn_power5_nohw:
mov rdi,rcx
mov rsi,rdx
mov rdx,r8
@@ -1142,11 +1135,6 @@
_CET_ENDBR
mov rax,rsp
- lea r11,[OPENSSL_ia32cap_P]
- mov r11d,DWORD[8+r11]
- and r11d,0x80108
- cmp r11d,0x80108
- je NEAR $L$powerx5_enter
push rbx
push rbp
@@ -1161,6 +1149,9 @@
$L$power5_prologue:
+
+
+
shl r9d,3
lea r10d,[r9*2+r9]
neg r9
@@ -1271,7 +1262,7 @@
mov rsi,QWORD[16+rsp]
ret
-$L$SEH_end_bn_power5:
+$L$SEH_end_bn_power5_nohw:
global bn_sqr8x_internal
@@ -2113,6 +2104,7 @@
ret
+global bn_mulx4x_mont_gather5
ALIGN 32
bn_mulx4x_mont_gather5:
@@ -2129,9 +2121,9 @@
+_CET_ENDBR
mov rax,rsp
-$L$mulx4x_enter:
push rbx
push rbp
@@ -2146,6 +2138,9 @@
$L$mulx4x_prologue:
+
+
+
shl r9d,3
lea r10,[r9*2+r9]
neg r9
@@ -2664,6 +2659,7 @@
jmp NEAR $L$sqrx4x_sub_entry
+global bn_powerx5
ALIGN 32
bn_powerx5:
@@ -2680,9 +2676,9 @@
+_CET_ENDBR
mov rax,rsp
-$L$powerx5_enter:
push rbx
push rbp
@@ -2697,6 +2693,9 @@
$L$powerx5_prologue:
+
+
+
shl r9d,3
lea r10,[r9*2+r9]
neg r9
@@ -3804,17 +3803,17 @@
section .pdata rdata align=4
ALIGN 4
- DD $L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase
- DD $L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase
- DD $L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
+ DD $L$SEH_begin_bn_mul_mont_gather5_nohw wrt ..imagebase
+ DD $L$SEH_end_bn_mul_mont_gather5_nohw wrt ..imagebase
+ DD $L$SEH_info_bn_mul_mont_gather5_nohw wrt ..imagebase
DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
- DD $L$SEH_begin_bn_power5 wrt ..imagebase
- DD $L$SEH_end_bn_power5 wrt ..imagebase
- DD $L$SEH_info_bn_power5 wrt ..imagebase
+ DD $L$SEH_begin_bn_power5_nohw wrt ..imagebase
+ DD $L$SEH_end_bn_power5_nohw wrt ..imagebase
+ DD $L$SEH_info_bn_power5_nohw wrt ..imagebase
DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
@@ -3828,7 +3827,7 @@
section .xdata rdata align=8
ALIGN 8
-$L$SEH_info_bn_mul_mont_gather5:
+$L$SEH_info_bn_mul_mont_gather5_nohw:
DB 9,0,0,0
DD mul_handler wrt ..imagebase
DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
@@ -3838,7 +3837,7 @@
DD mul_handler wrt ..imagebase
DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
ALIGN 8
-$L$SEH_info_bn_power5:
+$L$SEH_info_bn_power5_nohw:
DB 9,0,0,0
DD mul_handler wrt ..imagebase
DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase