bn: Change return type of `bn_mul_mont_*` internals to `void`.

These entry points used to return one if they handled |num| words and
zero otherwise, forcing callers to cope with a failure that could not
actually happen: both callers in montgomery.cc.inc already guard the
call with num >= 128 / BN_BITS2, so the zero path only survived as
assert(0) and abort() checks. Make the functions void, drop the
corresponding size checks from the assembly, and document the minimum
|num| in internal.h instead. The gen/ outputs are regenerated
accordingly.
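For illustration, a minimal sketch of the contract from the caller's
side. The helper below is hypothetical (not part of this change) and
assumes the declarations in crypto/fipsmodule/bn/internal.h:

    #include <assert.h>

    #include "crypto/fipsmodule/bn/internal.h"

    // Montgomery-multiplies two |num|-word inputs. The bounds checks
    // that bn_mul_mont used to perform (and signal via its return
    // value) are now the caller's responsibility.
    static void mont_mul_words(BN_ULONG *r, const BN_ULONG *a,
                               const BN_ULONG *b, const BN_MONT_CTX *mont,
                               size_t num) {
      assert(num >= 128 / BN_BITS2);           // required by the assembly
      assert(num <= BN_MONTGOMERY_MAX_WORDS);  // stack allocation bound
      bn_mul_mont(r, a, b, mont->N.d, mont->n0, num);  // void: cannot fail
    }

This mirrors the updated callers in montgomery.cc.inc, which check
num >= 128 / BN_BITS2 up front and no longer need an error path after
the call.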
Change-Id: Id8be6697df6a6f6613105c67c96250cc084595b2
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/75647
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl
index acae4e5..d3c867d 100644
--- a/crypto/fipsmodule/bn/asm/armv4-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl
@@ -121,14 +121,9 @@
bn_mul_mont_nohw:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
- cmp ip,#2
+ @ No return value. Instead, the caller must ensure num >= 2
mov $num,ip @ load num
-#ifdef __thumb2__
- ittt lt
-#endif
- movlt r0,#0
- addlt sp,sp,#2*4
- blt .Labrt
+ @ No return value
stmdb sp!,{r4-r12,lr} @ save 10 registers
@@ -262,8 +257,7 @@
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
- mov r0,#1
-.Labrt:
+ @ No return value
#if __ARM_ARCH>=5
ret @ bx lr
#else
@@ -717,6 +711,7 @@
mov sp,ip
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
+ @ No return value
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
diff --git a/crypto/fipsmodule/bn/asm/armv8-mont.pl b/crypto/fipsmodule/bn/asm/armv8-mont.pl
index 1ce02ee..fe4d8cf 100644
--- a/crypto/fipsmodule/bn/asm/armv8-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv8-mont.pl
@@ -60,7 +60,7 @@
$lo1,$hi1,$nj,$m1,$nlo,$nhi,
$ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
-# int bn_mul_mont(
+# void bn_mul_mont(
$rp="x0"; # BN_ULONG *rp,
$ap="x1"; # const BN_ULONG *ap,
$bp="x2"; # const BN_ULONG *bp,
@@ -270,7 +270,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
@@ -1044,7 +1044,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
@@ -1505,7 +1505,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
diff --git a/crypto/fipsmodule/bn/asm/x86-mont.pl b/crypto/fipsmodule/bn/asm/x86-mont.pl
index 3de17a4..c3e30cb 100755
--- a/crypto/fipsmodule/bn/asm/x86-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86-mont.pl
@@ -68,11 +68,9 @@
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
- &xor ("eax","eax");
+ # No return value. Instead, the caller must ensure num >= 4
&mov ("edi",&wparam(5)); # int num
- &cmp ("edi",4);
- &jl (&label("just_leave"));
-
+ # No return value.
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&add ("edi",2); # extra two words on top of tp
@@ -326,8 +324,7 @@
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
- &mov ("eax",1);
-&set_label("just_leave");
+ # No return value
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
index 537b028..c1d4028 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
@@ -70,7 +70,7 @@
# output, so this isn't useful anyway.
$addx = 1;
-# int bn_mul_mont_nohw(
+# void bn_mul_mont_nohw(
$rp="%rdi"; # BN_ULONG *rp,
$ap="%rsi"; # const BN_ULONG *ap,
$bp="%rdx"; # const BN_ULONG *bp,
@@ -315,7 +315,7 @@
mov 8(%rsp,$num,8),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
- mov \$1,%rax
+ # No return value
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
@@ -762,7 +762,7 @@
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
.cfi_def_cfa %rsi, 8
- mov \$1,%rax
+ # No return value
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
@@ -785,7 +785,7 @@
}}}
{{{
######################################################################
-# int bn_sqr8x_mont(
+# void bn_sqr8x_mont(
my $rptr="%rdi"; # const BN_ULONG *rptr,
my $aptr="%rsi"; # const BN_ULONG *aptr,
my $mulx_adx_capable="%rdx"; # Different than upstream!
@@ -976,7 +976,7 @@
add \$32,$num
jnz .Lsqr8x_cond_copy
- mov \$1,%rax
+ # No return value
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
@@ -1345,7 +1345,7 @@
mov %rdx,($tptr)
- mov \$1,%rax
+ # No return value
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 0a9997d..6ce2cc7 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -275,14 +275,14 @@
#define OPENSSL_BN_ASM_MONT
// bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words
// long. Inputs and outputs are in Montgomery form. |n0| is a pointer to the
-// corresponding field in |BN_MONT_CTX|. It returns one if |bn_mul_mont| handles
-// inputs of this size and zero otherwise.
+// corresponding field in |BN_MONT_CTX|.
//
// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
// If neither is fully-reduced, the output may not be either.
//
// This function allocates |num| words on the stack, so |num| should be at most
-// |BN_MONTGOMERY_MAX_WORDS|.
+// |BN_MONTGOMERY_MAX_WORDS|. Additionally, |num| must be at least 128 /
+// |BN_BITS2|.
//
// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
// off upper bits. The aarch64 implementation expects a 64-bit input and does
@@ -291,39 +291,39 @@
//
// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
// inputs.
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
- const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#if defined(OPENSSL_X86_64)
inline int bn_mulx_adx_capable(void) {
// MULX is in BMI2.
return CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable();
}
-int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
- const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, size_t num);
inline int bn_mul4x_mont_capable(size_t num) {
return num >= 8 && (num & 3) == 0;
}
-int bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
- const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, size_t num);
inline int bn_mulx4x_mont_capable(size_t num) {
return bn_mul4x_mont_capable(num) && bn_mulx_adx_capable();
}
-int bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
- const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, size_t num);
inline int bn_sqr8x_mont_capable(size_t num) {
return num >= 8 && (num & 7) == 0;
}
-int bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
- const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
+ const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#elif defined(OPENSSL_ARM)
inline int bn_mul8x_mont_neon_capable(size_t num) {
return (num & 7) == 0 && CRYPTO_is_NEON_capable();
}
-int bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
- const BN_ULONG *np, const BN_ULONG *n0, size_t num);
-int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
- const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#endif
#endif // OPENSSL_BN_ASM_MONT
diff --git a/crypto/fipsmodule/bn/montgomery.cc.inc b/crypto/fipsmodule/bn/montgomery.cc.inc
index e73f805..c0adcca 100644
--- a/crypto/fipsmodule/bn/montgomery.cc.inc
+++ b/crypto/fipsmodule/bn/montgomery.cc.inc
@@ -324,7 +324,7 @@
}
#if defined(OPENSSL_BN_ASM_MONT)
- // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
+ // |bn_mul_mont| requires at least 128 bits of limbs.
int num = mont->N.width;
if (num >= (128 / BN_BITS2) && a->width == num && b->width == num) {
if (!bn_wexpand(r, num)) {
@@ -333,12 +333,7 @@
// This bound is implied by |bn_mont_ctx_set_N_and_n0|. |bn_mul_mont|
// allocates |num| words on the stack, so |num| cannot be too large.
assert((size_t)num <= BN_MONTGOMERY_MAX_WORDS);
- if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
- // The check above ensures this won't happen.
- assert(0);
- OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
- return 0;
- }
+ bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num);
r->neg = 0;
r->width = num;
return 1;
@@ -379,11 +374,9 @@
}
#if defined(OPENSSL_BN_ASM_MONT)
- // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
+ // |bn_mul_mont| requires at least 128 bits of limbs.
if (num >= (128 / BN_BITS2)) {
- if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num)) {
- abort(); // The check above ensures this won't happen.
- }
+ bn_mul_mont(r, a, b, mont->N.d, mont->n0, num);
return;
}
#endif
@@ -404,27 +397,27 @@
}
#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_X86_64)
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
if (ap == bp && bn_sqr8x_mont_capable(num)) {
- return bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num);
+ bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num);
+ } else if (bn_mulx4x_mont_capable(num)) {
+ bn_mulx4x_mont(rp, ap, bp, np, n0, num);
+ } else if (bn_mul4x_mont_capable(num)) {
+ bn_mul4x_mont(rp, ap, bp, np, n0, num);
+ } else {
+ bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
- if (bn_mulx4x_mont_capable(num)) {
- return bn_mulx4x_mont(rp, ap, bp, np, n0, num);
- }
- if (bn_mul4x_mont_capable(num)) {
- return bn_mul4x_mont(rp, ap, bp, np, n0, num);
- }
- return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#endif
#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_ARM)
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
- const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+ const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
if (bn_mul8x_mont_neon_capable(num)) {
- return bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
+ bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
+ } else {
+ bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
- return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#endif
diff --git a/gen/bcm/armv4-mont-linux.S b/gen/bcm/armv4-mont-linux.S
index 704f607..73e42b5 100644
--- a/gen/bcm/armv4-mont-linux.S
+++ b/gen/bcm/armv4-mont-linux.S
@@ -24,14 +24,9 @@
bn_mul_mont_nohw:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
- cmp ip,#2
+ @ No return value. Instead, the caller must ensure num >= 2
mov r0,ip @ load num
-#ifdef __thumb2__
- ittt lt
-#endif
- movlt r0,#0
- addlt sp,sp,#2*4
- blt .Labrt
+ @ No return value
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers
@@ -165,8 +160,7 @@
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
- mov r0,#1
-.Labrt:
+ @ No return value
#if __ARM_ARCH>=5
bx lr @ bx lr
#else
@@ -929,6 +923,7 @@
mov sp,ip
vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ @ No return value
bx lr @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
diff --git a/gen/bcm/armv8-mont-apple.S b/gen/bcm/armv8-mont-apple.S
index 9e39c1d..6aad968 100644
--- a/gen/bcm/armv8-mont-apple.S
+++ b/gen/bcm/armv8-mont-apple.S
@@ -206,7 +206,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
@@ -965,7 +965,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
@@ -1408,7 +1408,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
diff --git a/gen/bcm/armv8-mont-linux.S b/gen/bcm/armv8-mont-linux.S
index 168162f..e49322b 100644
--- a/gen/bcm/armv8-mont-linux.S
+++ b/gen/bcm/armv8-mont-linux.S
@@ -206,7 +206,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
@@ -965,7 +965,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
@@ -1408,7 +1408,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
diff --git a/gen/bcm/armv8-mont-win.S b/gen/bcm/armv8-mont-win.S
index b521d49..4091a5a 100644
--- a/gen/bcm/armv8-mont-win.S
+++ b/gen/bcm/armv8-mont-win.S
@@ -208,7 +208,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
@@ -969,7 +969,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
@@ -1414,7 +1414,7 @@
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
diff --git a/gen/bcm/x86-mont-apple.S b/gen/bcm/x86-mont-apple.S
index a8fd1f9..6e549c7 100644
--- a/gen/bcm/x86-mont-apple.S
+++ b/gen/bcm/x86-mont-apple.S
@@ -14,10 +14,7 @@
pushl %ebx
pushl %esi
pushl %edi
- xorl %eax,%eax
movl 40(%esp),%edi
- cmpl $4,%edi
- jl L000just_leave
leal 20(%esp),%esi
leal 24(%esp),%edx
addl $2,%edi
@@ -40,15 +37,15 @@
leal (%ebp,%eax,1),%esp
movl (%esp),%eax
cmpl %ebp,%esp
- ja L001page_walk
- jmp L002page_walk_done
+ ja L000page_walk
+ jmp L001page_walk_done
.align 4,0x90
-L001page_walk:
+L000page_walk:
leal -4096(%esp),%esp
movl (%esp),%eax
cmpl %ebp,%esp
- ja L001page_walk
-L002page_walk_done:
+ ja L000page_walk
+L001page_walk_done:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -85,7 +82,7 @@
psrlq $32,%mm3
incl %ecx
.align 4,0x90
-L0031st:
+L0021st:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -100,7 +97,7 @@
psrlq $32,%mm3
leal 1(%ecx),%ecx
cmpl %ebx,%ecx
- jl L0031st
+ jl L0021st
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -114,7 +111,7 @@
paddq %mm2,%mm3
movq %mm3,32(%esp,%ebx,4)
incl %edx
-L004outer:
+L003outer:
xorl %ecx,%ecx
movd (%edi,%edx,4),%mm4
movd (%esi),%mm5
@@ -136,7 +133,7 @@
paddq %mm6,%mm2
incl %ecx
decl %ebx
-L005inner:
+L004inner:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -153,7 +150,7 @@
paddq %mm6,%mm2
decl %ebx
leal 1(%ecx),%ecx
- jnz L005inner
+ jnz L004inner
movl %ecx,%ebx
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
@@ -171,11 +168,11 @@
movq %mm3,32(%esp,%ebx,4)
leal 1(%edx),%edx
cmpl %ebx,%edx
- jle L004outer
+ jle L003outer
emms
- jmp L006common_tail
+ jmp L005common_tail
.align 4,0x90
-L006common_tail:
+L005common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -183,19 +180,19 @@
movl %ebx,%ecx
xorl %edx,%edx
.align 4,0x90
-L007sub:
+L006sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge L007sub
+ jge L006sub
sbbl $0,%eax
movl $-1,%edx
xorl %eax,%edx
- jmp L008copy
+ jmp L007copy
.align 4,0x90
-L008copy:
+L007copy:
movl 32(%esp,%ebx,4),%esi
movl (%edi,%ebx,4),%ebp
movl %ecx,32(%esp,%ebx,4)
@@ -204,10 +201,8 @@
orl %esi,%ebp
movl %ebp,(%edi,%ebx,4)
decl %ebx
- jge L008copy
+ jge L007copy
movl 24(%esp),%esp
- movl $1,%eax
-L000just_leave:
popl %edi
popl %esi
popl %ebx
diff --git a/gen/bcm/x86-mont-linux.S b/gen/bcm/x86-mont-linux.S
index 3d3ddb5..21fbee2 100644
--- a/gen/bcm/x86-mont-linux.S
+++ b/gen/bcm/x86-mont-linux.S
@@ -15,10 +15,7 @@
pushl %ebx
pushl %esi
pushl %edi
- xorl %eax,%eax
movl 40(%esp),%edi
- cmpl $4,%edi
- jl .L000just_leave
leal 20(%esp),%esi
leal 24(%esp),%edx
addl $2,%edi
@@ -41,15 +38,15 @@
leal (%ebp,%eax,1),%esp
movl (%esp),%eax
cmpl %ebp,%esp
- ja .L001page_walk
- jmp .L002page_walk_done
+ ja .L000page_walk
+ jmp .L001page_walk_done
.align 16
-.L001page_walk:
+.L000page_walk:
leal -4096(%esp),%esp
movl (%esp),%eax
cmpl %ebp,%esp
- ja .L001page_walk
-.L002page_walk_done:
+ ja .L000page_walk
+.L001page_walk_done:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -86,7 +83,7 @@
psrlq $32,%mm3
incl %ecx
.align 16
-.L0031st:
+.L0021st:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -101,7 +98,7 @@
psrlq $32,%mm3
leal 1(%ecx),%ecx
cmpl %ebx,%ecx
- jl .L0031st
+ jl .L0021st
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -115,7 +112,7 @@
paddq %mm2,%mm3
movq %mm3,32(%esp,%ebx,4)
incl %edx
-.L004outer:
+.L003outer:
xorl %ecx,%ecx
movd (%edi,%edx,4),%mm4
movd (%esi),%mm5
@@ -137,7 +134,7 @@
paddq %mm6,%mm2
incl %ecx
decl %ebx
-.L005inner:
+.L004inner:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -154,7 +151,7 @@
paddq %mm6,%mm2
decl %ebx
leal 1(%ecx),%ecx
- jnz .L005inner
+ jnz .L004inner
movl %ecx,%ebx
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
@@ -172,11 +169,11 @@
movq %mm3,32(%esp,%ebx,4)
leal 1(%edx),%edx
cmpl %ebx,%edx
- jle .L004outer
+ jle .L003outer
emms
- jmp .L006common_tail
+ jmp .L005common_tail
.align 16
-.L006common_tail:
+.L005common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -184,19 +181,19 @@
movl %ebx,%ecx
xorl %edx,%edx
.align 16
-.L007sub:
+.L006sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge .L007sub
+ jge .L006sub
sbbl $0,%eax
movl $-1,%edx
xorl %eax,%edx
- jmp .L008copy
+ jmp .L007copy
.align 16
-.L008copy:
+.L007copy:
movl 32(%esp,%ebx,4),%esi
movl (%edi,%ebx,4),%ebp
movl %ecx,32(%esp,%ebx,4)
@@ -205,10 +202,8 @@
orl %esi,%ebp
movl %ebp,(%edi,%ebx,4)
decl %ebx
- jge .L008copy
+ jge .L007copy
movl 24(%esp),%esp
- movl $1,%eax
-.L000just_leave:
popl %edi
popl %esi
popl %ebx
diff --git a/gen/bcm/x86-mont-win.asm b/gen/bcm/x86-mont-win.asm
index 931275d..d154078 100644
--- a/gen/bcm/x86-mont-win.asm
+++ b/gen/bcm/x86-mont-win.asm
@@ -21,10 +21,7 @@
push ebx
push esi
push edi
- xor eax,eax
mov edi,DWORD [40+esp]
- cmp edi,4
- jl NEAR L$000just_leave
lea esi,[20+esp]
lea edx,[24+esp]
add edi,2
@@ -47,15 +44,15 @@
lea esp,[eax*1+ebp]
mov eax,DWORD [esp]
cmp esp,ebp
- ja NEAR L$001page_walk
- jmp NEAR L$002page_walk_done
+ ja NEAR L$000page_walk
+ jmp NEAR L$001page_walk_done
align 16
-L$001page_walk:
+L$000page_walk:
lea esp,[esp-4096]
mov eax,DWORD [esp]
cmp esp,ebp
- ja NEAR L$001page_walk
-L$002page_walk_done:
+ ja NEAR L$000page_walk
+L$001page_walk_done:
mov eax,DWORD [esi]
mov ebx,DWORD [4+esi]
mov ecx,DWORD [8+esi]
@@ -92,7 +89,7 @@
psrlq mm3,32
inc ecx
align 16
-L$0031st:
+L$0021st:
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -107,7 +104,7 @@
psrlq mm3,32
lea ecx,[1+ecx]
cmp ecx,ebx
- jl NEAR L$0031st
+ jl NEAR L$0021st
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -121,7 +118,7 @@
paddq mm3,mm2
movq [32+ebx*4+esp],mm3
inc edx
-L$004outer:
+L$003outer:
xor ecx,ecx
movd mm4,DWORD [edx*4+edi]
movd mm5,DWORD [esi]
@@ -143,7 +140,7 @@
paddq mm2,mm6
inc ecx
dec ebx
-L$005inner:
+L$004inner:
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -160,7 +157,7 @@
paddq mm2,mm6
dec ebx
lea ecx,[1+ecx]
- jnz NEAR L$005inner
+ jnz NEAR L$004inner
mov ebx,ecx
pmuludq mm0,mm4
pmuludq mm1,mm5
@@ -178,11 +175,11 @@
movq [32+ebx*4+esp],mm3
lea edx,[1+edx]
cmp edx,ebx
- jle NEAR L$004outer
+ jle NEAR L$003outer
emms
- jmp NEAR L$006common_tail
+ jmp NEAR L$005common_tail
align 16
-L$006common_tail:
+L$005common_tail:
mov ebp,DWORD [16+esp]
mov edi,DWORD [4+esp]
lea esi,[32+esp]
@@ -190,19 +187,19 @@
mov ecx,ebx
xor edx,edx
align 16
-L$007sub:
+L$006sub:
sbb eax,DWORD [edx*4+ebp]
mov DWORD [edx*4+edi],eax
dec ecx
mov eax,DWORD [4+edx*4+esi]
lea edx,[1+edx]
- jge NEAR L$007sub
+ jge NEAR L$006sub
sbb eax,0
mov edx,-1
xor edx,eax
- jmp NEAR L$008copy
+ jmp NEAR L$007copy
align 16
-L$008copy:
+L$007copy:
mov esi,DWORD [32+ebx*4+esp]
mov ebp,DWORD [ebx*4+edi]
mov DWORD [32+ebx*4+esp],ecx
@@ -211,10 +208,8 @@
or ebp,esi
mov DWORD [ebx*4+edi],ebp
dec ebx
- jge NEAR L$008copy
+ jge NEAR L$007copy
mov esp,DWORD [24+esp]
- mov eax,1
-L$000just_leave:
pop edi
pop esi
pop ebx
diff --git a/gen/bcm/x86_64-mont-apple.S b/gen/bcm/x86_64-mont-apple.S
index d429f7c..27a168d 100644
--- a/gen/bcm/x86_64-mont-apple.S
+++ b/gen/bcm/x86_64-mont-apple.S
@@ -229,7 +229,7 @@
movq 8(%rsp,%r9,8),%rsi
- movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
@@ -662,7 +662,7 @@
jnz L$copy4x
movq 8(%rsp,%r9,8),%rsi
- movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
@@ -853,7 +853,7 @@
addq $32,%r9
jnz L$sqr8x_cond_copy
- movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
@@ -1211,7 +1211,7 @@
movq %rdx,(%rbx)
- movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
diff --git a/gen/bcm/x86_64-mont-linux.S b/gen/bcm/x86_64-mont-linux.S
index 630bb72..51c4b6c 100644
--- a/gen/bcm/x86_64-mont-linux.S
+++ b/gen/bcm/x86_64-mont-linux.S
@@ -229,7 +229,7 @@
movq 8(%rsp,%r9,8),%rsi
.cfi_def_cfa %rsi,8
- movq $1,%rax
+
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
@@ -662,7 +662,7 @@
jnz .Lcopy4x
movq 8(%rsp,%r9,8),%rsi
.cfi_def_cfa %rsi, 8
- movq $1,%rax
+
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
@@ -855,7 +855,7 @@
addq $32,%r9
jnz .Lsqr8x_cond_copy
- movq $1,%rax
+
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
@@ -1213,7 +1213,7 @@
movq %rdx,(%rbx)
- movq $1,%rax
+
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
diff --git a/gen/bcm/x86_64-mont-win.asm b/gen/bcm/x86_64-mont-win.asm
index 7e54c66..c768d16 100644
--- a/gen/bcm/x86_64-mont-win.asm
+++ b/gen/bcm/x86_64-mont-win.asm
@@ -248,7 +248,7 @@
mov rsi,QWORD[8+r9*8+rsp]
- mov rax,1
+
mov r15,QWORD[((-48))+rsi]
mov r14,QWORD[((-40))+rsi]
@@ -694,7 +694,7 @@
jnz NEAR $L$copy4x
mov rsi,QWORD[8+r9*8+rsp]
- mov rax,1
+
mov r15,QWORD[((-48))+rsi]
mov r14,QWORD[((-40))+rsi]
@@ -898,7 +898,7 @@
add r9,32
jnz NEAR $L$sqr8x_cond_copy
- mov rax,1
+
mov r15,QWORD[((-48))+rsi]
mov r14,QWORD[((-40))+rsi]
@@ -1269,7 +1269,7 @@
mov QWORD[rbx],rdx
- mov rax,1
+
mov r15,QWORD[((-48))+rsi]
mov r14,QWORD[((-40))+rsi]