bn: Change return type of `bn_mul_mont_*` internals to `void`.

Change-Id: Id8be6697df6a6f6613105c67c96250cc084595b2
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/75647
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
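
For illustration, a sketch of the caller-side contract after this change. The
wrapper function below is hypothetical and the include is abbreviated;
bn_mul_mont, BN_MONT_CTX, BN_BITS2, and BN_MONTGOMERY_MAX_WORDS are the
existing names that appear in the diff:

    #include <assert.h>
    #include <stddef.h>

    #include "internal.h"  // crypto/fipsmodule/bn/internal.h (illustrative path)

    // Hypothetical caller: with a void return, the size check that the old
    // return value used to signal is now the caller's responsibility.
    static void mont_mul_words(BN_ULONG *r, const BN_ULONG *a,
                               const BN_ULONG *b, const BN_MONT_CTX *mont,
                               size_t num) {
      // |bn_mul_mont| requires num >= 128 / BN_BITS2 and allocates |num|
      // words on the stack, so num must stay within BN_MONTGOMERY_MAX_WORDS.
      assert(num >= 128 / BN_BITS2);
      assert(num <= BN_MONTGOMERY_MAX_WORDS);
      bn_mul_mont(r, a, b, mont->N.d, mont->n0, num);
    }
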
diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl
index acae4e5..d3c867d 100644
--- a/crypto/fipsmodule/bn/asm/armv4-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl
@@ -121,14 +121,9 @@
 bn_mul_mont_nohw:
 ldr ip,[sp,#4] @ load num
 stmdb sp!,{r0,r2} @ sp points at argument block
- cmp ip,#2
+ @ No return value. Instead, the caller must ensure num >= 2
 mov $num,ip @ load num
-#ifdef __thumb2__
- ittt lt
-#endif
- movlt r0,#0
- addlt sp,sp,#2*4
- blt .Labrt
+ @ No return value
 stmdb sp!,{r4-r12,lr} @ save 10 registers
@@ -262,8 +257,7 @@
 add sp,sp,#4 @ skip over tp[num+1]
 ldmia sp!,{r4-r12,lr} @ restore registers
 add sp,sp,#2*4 @ skip over {r0,r2}
- mov r0,#1
-.Labrt:
+ @ No return value
 #if __ARM_ARCH>=5
 ret @ bx lr
 #else
@@ -717,6 +711,7 @@
 mov sp,ip
 vldmia sp!,{d8-d15}
 ldmia sp!,{r4-r11}
+ @ No return value
 ret @ bx lr
 .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
diff --git a/crypto/fipsmodule/bn/asm/armv8-mont.pl b/crypto/fipsmodule/bn/asm/armv8-mont.pl
index 1ce02ee..fe4d8cf 100644
--- a/crypto/fipsmodule/bn/asm/armv8-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv8-mont.pl
@@ -60,7 +60,7 @@
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
-# int bn_mul_mont(
+# void bn_mul_mont(
 $rp="x0"; # BN_ULONG *rp,
 $ap="x1"; # const BN_ULONG *ap,
 $bp="x2"; # const BN_ULONG *bp,
@@ -270,7 +270,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldr x29,[sp],#64
 AARCH64_VALIDATE_LINK_REGISTER
@@ -1044,7 +1044,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
@@ -1505,7 +1505,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
diff --git a/crypto/fipsmodule/bn/asm/x86-mont.pl b/crypto/fipsmodule/bn/asm/x86-mont.pl
index 3de17a4..c3e30cb 100755
--- a/crypto/fipsmodule/bn/asm/x86-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86-mont.pl
@@ -68,11 +68,9 @@
 $_bpend=&DWP(4*7,"esp");
 $frame=32; # size of above frame rounded up to 16n
- &xor ("eax","eax");
+ # No return value. Instead, the caller must ensure num >= 4
 &mov ("edi",&wparam(5)); # int num
- &cmp ("edi",4);
- &jl (&label("just_leave"));
-
+ # No return value.
 &lea ("esi",&wparam(0)); # put aside pointer to argument block
 &lea ("edx",&wparam(1)); # load ap
 &add ("edi",2); # extra two words on top of tp
@@ -326,8 +324,7 @@
 &jge (&label("copy"));
 &mov ("esp",$_sp); # pull saved stack pointer
- &mov ("eax",1);
-&set_label("just_leave");
+ # No return value
 &function_end("bn_mul_mont");
 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
index 537b028..c1d4028 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
@@ -70,7 +70,7 @@
 # output, so this isn't useful anyway.
 $addx = 1;
-# int bn_mul_mont_nohw(
+# void bn_mul_mont_nohw(
 $rp="%rdi"; # BN_ULONG *rp,
 $ap="%rsi"; # const BN_ULONG *ap,
 $bp="%rdx"; # const BN_ULONG *bp,
@@ -315,7 +315,7 @@
 mov 8(%rsp,$num,8),%rsi # restore %rsp
 .cfi_def_cfa %rsi,8
- mov \$1,%rax
+ # No return value
 mov -48(%rsi),%r15
 .cfi_restore %r15
 mov -40(%rsi),%r14
@@ -762,7 +762,7 @@
 $code.=<<___;
 mov 8(%rsp,$num,8),%rsi # restore %rsp
 .cfi_def_cfa %rsi, 8
- mov \$1,%rax
+ # No return value
 mov -48(%rsi),%r15
 .cfi_restore %r15
 mov -40(%rsi),%r14
@@ -785,7 +785,7 @@
 }}}
 {{{
 ######################################################################
-# int bn_sqr8x_mont(
+# void bn_sqr8x_mont(
 my $rptr="%rdi"; # const BN_ULONG *rptr,
 my $aptr="%rsi"; # const BN_ULONG *aptr,
 my $mulx_adx_capable="%rdx"; # Different than upstream!
@@ -976,7 +976,7 @@
 add \$32,$num
 jnz .Lsqr8x_cond_copy
- mov \$1,%rax
+ # No return value
 mov -48(%rsi),%r15
 .cfi_restore %r15
 mov -40(%rsi),%r14
@@ -1345,7 +1345,7 @@
 mov %rdx,($tptr)
- mov \$1,%rax
+ # No return value
 mov -48(%rsi),%r15
 .cfi_restore %r15
 mov -40(%rsi),%r14
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 0a9997d..6ce2cc7 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -275,14 +275,14 @@
 #define OPENSSL_BN_ASM_MONT
 // bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words
 // long. Inputs and outputs are in Montgomery form. |n0| is a pointer to the
-// corresponding field in |BN_MONT_CTX|. It returns one if |bn_mul_mont| handles
-// inputs of this size and zero otherwise.
+// corresponding field in |BN_MONT_CTX|.
 //
 // If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
 // If neither is fully-reduced, the output may not be either.
 //
 // This function allocates |num| words on the stack, so |num| should be at most
-// |BN_MONTGOMERY_MAX_WORDS|.
+// |BN_MONTGOMERY_MAX_WORDS|. Additionally, |num| must be at least 128 /
+// |BN_BITS2|.
 //
 // TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
 // off upper bits. The aarch64 implementation expects a 64-bit input and does
@@ -291,39 +291,39 @@
 //
 // See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
 // inputs.
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                 const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 #if defined(OPENSSL_X86_64)
 inline int bn_mulx_adx_capable(void) {
   // MULX is in BMI2.
   return CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable();
 }
-int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                     const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 inline int bn_mul4x_mont_capable(size_t num) {
   return num >= 8 && (num & 3) == 0;
 }
-int bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                  const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 inline int bn_mulx4x_mont_capable(size_t num) {
   return bn_mul4x_mont_capable(num) && bn_mulx_adx_capable();
 }
-int bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                    const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 inline int bn_sqr8x_mont_capable(size_t num) {
   return num >= 8 && (num & 7) == 0;
 }
-int bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
-                  const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
+                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 #elif defined(OPENSSL_ARM)
 inline int bn_mul8x_mont_neon_capable(size_t num) {
   return (num & 7) == 0 && CRYPTO_is_NEON_capable();
 }
-int bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                       const BN_ULONG *np, const BN_ULONG *n0, size_t num);
-int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                     const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                        const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 #endif
 #endif // OPENSSL_BN_ASM_MONT
diff --git a/crypto/fipsmodule/bn/montgomery.cc.inc b/crypto/fipsmodule/bn/montgomery.cc.inc
index e73f805..c0adcca 100644
--- a/crypto/fipsmodule/bn/montgomery.cc.inc
+++ b/crypto/fipsmodule/bn/montgomery.cc.inc
@@ -324,7 +324,7 @@
   }
 #if defined(OPENSSL_BN_ASM_MONT)
-  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
+  // |bn_mul_mont| requires at least 128 bits of limbs.
   int num = mont->N.width;
   if (num >= (128 / BN_BITS2) && a->width == num && b->width == num) {
     if (!bn_wexpand(r, num)) {
@@ -333,12 +333,7 @@
     // This bound is implied by |bn_mont_ctx_set_N_and_n0|. |bn_mul_mont|
     // allocates |num| words on the stack, so |num| cannot be too large.
     assert((size_t)num <= BN_MONTGOMERY_MAX_WORDS);
-    if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
-      // The check above ensures this won't happen.
-      assert(0);
-      OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
-      return 0;
-    }
+    bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num);
     r->neg = 0;
     r->width = num;
     return 1;
@@ -379,11 +374,9 @@
   }
 #if defined(OPENSSL_BN_ASM_MONT)
-  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
+  // |bn_mul_mont| requires at least 128 bits of limbs.
   if (num >= (128 / BN_BITS2)) {
-    if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num)) {
-      abort();  // The check above ensures this won't happen.
-    }
+    bn_mul_mont(r, a, b, mont->N.d, mont->n0, num);
     return;
   }
 #endif
@@ -404,27 +397,27 @@
 }
 #if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_X86_64)
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                 const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
   if (ap == bp && bn_sqr8x_mont_capable(num)) {
-    return bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num);
+    bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num);
+  } else if (bn_mulx4x_mont_capable(num)) {
+    bn_mulx4x_mont(rp, ap, bp, np, n0, num);
+  } else if (bn_mul4x_mont_capable(num)) {
+    bn_mul4x_mont(rp, ap, bp, np, n0, num);
+  } else {
+    bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
   }
-  if (bn_mulx4x_mont_capable(num)) {
-    return bn_mulx4x_mont(rp, ap, bp, np, n0, num);
-  }
-  if (bn_mul4x_mont_capable(num)) {
-    return bn_mul4x_mont(rp, ap, bp, np, n0, num);
-  }
-  return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
 }
 #endif
 #if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_ARM)
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                 const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
   if (bn_mul8x_mont_neon_capable(num)) {
-    return bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
+    bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
+  } else {
+    bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
  }
-  return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
 }
 #endif
diff --git a/gen/bcm/armv4-mont-linux.S b/gen/bcm/armv4-mont-linux.S
index 704f607..73e42b5 100644
--- a/gen/bcm/armv4-mont-linux.S
+++ b/gen/bcm/armv4-mont-linux.S
@@ -24,14 +24,9 @@
 bn_mul_mont_nohw:
 ldr ip,[sp,#4] @ load num
 stmdb sp!,{r0,r2} @ sp points at argument block
- cmp ip,#2
+ @ No return value. Instead, the caller must ensure num >= 2
 mov r0,ip @ load num
-#ifdef __thumb2__
- ittt lt
-#endif
- movlt r0,#0
- addlt sp,sp,#2*4
- blt .Labrt
+ @ No return value
 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers
@@ -165,8 +160,7 @@
 add sp,sp,#4 @ skip over tp[num+1]
 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers
 add sp,sp,#2*4 @ skip over {r0,r2}
- mov r0,#1
-.Labrt:
+ @ No return value
 #if __ARM_ARCH>=5
 bx lr @ bx lr
 #else
@@ -929,6 +923,7 @@
 mov sp,ip
 vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ @ No return value
 bx lr @ bx lr
 .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
diff --git a/gen/bcm/armv8-mont-apple.S b/gen/bcm/armv8-mont-apple.S
index 9e39c1d..6aad968 100644
--- a/gen/bcm/armv8-mont-apple.S
+++ b/gen/bcm/armv8-mont-apple.S
@@ -206,7 +206,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldr x29,[sp],#64
 AARCH64_VALIDATE_LINK_REGISTER
@@ -965,7 +965,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
@@ -1408,7 +1408,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
diff --git a/gen/bcm/armv8-mont-linux.S b/gen/bcm/armv8-mont-linux.S
index 168162f..e49322b 100644
--- a/gen/bcm/armv8-mont-linux.S
+++ b/gen/bcm/armv8-mont-linux.S
@@ -206,7 +206,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldr x29,[sp],#64
 AARCH64_VALIDATE_LINK_REGISTER
@@ -965,7 +965,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
@@ -1408,7 +1408,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
diff --git a/gen/bcm/armv8-mont-win.S b/gen/bcm/armv8-mont-win.S
index b521d49..4091a5a 100644
--- a/gen/bcm/armv8-mont-win.S
+++ b/gen/bcm/armv8-mont-win.S
@@ -208,7 +208,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldr x29,[sp],#64
 AARCH64_VALIDATE_LINK_REGISTER
@@ -969,7 +969,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
@@ -1414,7 +1414,7 @@
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
- mov x0,#1
+ // No return value
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
diff --git a/gen/bcm/x86-mont-apple.S b/gen/bcm/x86-mont-apple.S
index a8fd1f9..6e549c7 100644
--- a/gen/bcm/x86-mont-apple.S
+++ b/gen/bcm/x86-mont-apple.S
@@ -14,10 +14,7 @@
 pushl %ebx
 pushl %esi
 pushl %edi
- xorl %eax,%eax
 movl 40(%esp),%edi
- cmpl $4,%edi
- jl L000just_leave
 leal 20(%esp),%esi
 leal 24(%esp),%edx
 addl $2,%edi
@@ -40,15 +37,15 @@
 leal (%ebp,%eax,1),%esp
 movl (%esp),%eax
 cmpl %ebp,%esp
- ja L001page_walk
- jmp L002page_walk_done
+ ja L000page_walk
+ jmp L001page_walk_done
 .align 4,0x90
-L001page_walk:
+L000page_walk:
 leal -4096(%esp),%esp
 movl (%esp),%eax
 cmpl %ebp,%esp
- ja L001page_walk
-L002page_walk_done:
+ ja L000page_walk
+L001page_walk_done:
 movl (%esi),%eax
 movl 4(%esi),%ebx
 movl 8(%esi),%ecx
@@ -85,7 +82,7 @@
 psrlq $32,%mm3
 incl %ecx
 .align 4,0x90
-L0031st:
+L0021st:
 pmuludq %mm4,%mm0
 pmuludq %mm5,%mm1
 paddq %mm0,%mm2
@@ -100,7 +97,7 @@
 psrlq $32,%mm3
 leal 1(%ecx),%ecx
 cmpl %ebx,%ecx
- jl L0031st
+ jl L0021st
 pmuludq %mm4,%mm0
 pmuludq %mm5,%mm1
 paddq %mm0,%mm2
@@ -114,7 +111,7 @@
 paddq %mm2,%mm3
 movq %mm3,32(%esp,%ebx,4)
 incl %edx
-L004outer:
+L003outer:
 xorl %ecx,%ecx
 movd (%edi,%edx,4),%mm4
 movd (%esi),%mm5
@@ -136,7 +133,7 @@
 paddq %mm6,%mm2
 incl %ecx
 decl %ebx
-L005inner:
+L004inner:
 pmuludq %mm4,%mm0
 pmuludq %mm5,%mm1
 paddq %mm0,%mm2
@@ -153,7 +150,7 @@
 paddq %mm6,%mm2
 decl %ebx
 leal 1(%ecx),%ecx
- jnz L005inner
+ jnz L004inner
 movl %ecx,%ebx
 pmuludq %mm4,%mm0
 pmuludq %mm5,%mm1
@@ -171,11 +168,11 @@
 movq %mm3,32(%esp,%ebx,4)
 leal 1(%edx),%edx
 cmpl %ebx,%edx
- jle L004outer
+ jle L003outer
 emms
- jmp L006common_tail
+ jmp L005common_tail
 .align 4,0x90
-L006common_tail:
+L005common_tail:
 movl 16(%esp),%ebp
 movl 4(%esp),%edi
 leal 32(%esp),%esi
@@ -183,19 +180,19 @@
 movl %ebx,%ecx
 xorl %edx,%edx
 .align 4,0x90
-L007sub:
+L006sub:
 sbbl (%ebp,%edx,4),%eax
 movl %eax,(%edi,%edx,4)
 decl %ecx
 movl 4(%esi,%edx,4),%eax
 leal 1(%edx),%edx
- jge L007sub
+ jge L006sub
 sbbl $0,%eax
 movl $-1,%edx
 xorl %eax,%edx
- jmp L008copy
+ jmp L007copy
 .align 4,0x90
-L008copy:
+L007copy:
 movl 32(%esp,%ebx,4),%esi
 movl (%edi,%ebx,4),%ebp
 movl %ecx,32(%esp,%ebx,4)
@@ -204,10 +201,8 @@
 orl %esi,%ebp
 movl %ebp,(%edi,%ebx,4)
 decl %ebx
- jge L008copy
+ jge L007copy
 movl 24(%esp),%esp
- movl $1,%eax
-L000just_leave:
 popl %edi
 popl %esi
 popl %ebx
diff --git a/gen/bcm/x86-mont-linux.S b/gen/bcm/x86-mont-linux.S
index 3d3ddb5..21fbee2 100644
--- a/gen/bcm/x86-mont-linux.S
+++ b/gen/bcm/x86-mont-linux.S
@@ -15,10 +15,7 @@
 pushl %ebx
 pushl %esi
 pushl %edi
- xorl %eax,%eax
 movl 40(%esp),%edi
- cmpl $4,%edi
- jl .L000just_leave
 leal 20(%esp),%esi
 leal 24(%esp),%edx
 addl $2,%edi
@@ -41,15 +38,15 @@
 leal (%ebp,%eax,1),%esp
 movl (%esp),%eax
 cmpl %ebp,%esp
- ja .L001page_walk
- jmp .L002page_walk_done
+ ja .L000page_walk
+ jmp .L001page_walk_done
 .align 16
-.L001page_walk:
+.L000page_walk:
 leal -4096(%esp),%esp
 movl (%esp),%eax
 cmpl %ebp,%esp
- ja .L001page_walk
-.L002page_walk_done:
+ ja .L000page_walk
+.L001page_walk_done:
 movl (%esi),%eax
 movl 4(%esi),%ebx
 movl 8(%esi),%ecx
@@ -86,7 +83,7 @@
 psrlq $32,%mm3
 incl %ecx
 .align 16
-.L0031st:
+.L0021st:
 pmuludq %mm4,%mm0
 pmuludq %mm5,%mm1
 paddq %mm0,%mm2
@@ -101,7 +98,7 @@
 psrlq $32,%mm3
 leal 1(%ecx),%ecx
 cmpl %ebx,%ecx
- jl .L0031st
+ jl .L0021st
 pmuludq %mm4,%mm0
 pmuludq %mm5,%mm1
 paddq %mm0,%mm2
@@ -115,7 +112,7 @@
 paddq %mm2,%mm3
 movq %mm3,32(%esp,%ebx,4)
 incl %edx
-.L004outer:
+.L003outer:
 xorl %ecx,%ecx
 movd (%edi,%edx,4),%mm4
 movd (%esi),%mm5
@@ -137,7 +134,7 @@
 paddq %mm6,%mm2
 incl %ecx
 decl %ebx
-.L005inner:
+.L004inner:
 pmuludq %mm4,%mm0
 pmuludq %mm5,%mm1
 paddq %mm0,%mm2
@@ -154,7 +151,7 @@
 paddq %mm6,%mm2
 decl %ebx
 leal 1(%ecx),%ecx
- jnz .L005inner
+ jnz .L004inner
 movl %ecx,%ebx
 pmuludq %mm4,%mm0
 pmuludq %mm5,%mm1
@@ -172,11 +169,11 @@
 movq %mm3,32(%esp,%ebx,4)
 leal 1(%edx),%edx
 cmpl %ebx,%edx
- jle .L004outer
+ jle .L003outer
 emms
- jmp .L006common_tail
+ jmp .L005common_tail
 .align 16
-.L006common_tail:
+.L005common_tail:
 movl 16(%esp),%ebp
 movl 4(%esp),%edi
 leal 32(%esp),%esi
@@ -184,19 +181,19 @@
 movl %ebx,%ecx
 xorl %edx,%edx
 .align 16
-.L007sub:
+.L006sub:
 sbbl (%ebp,%edx,4),%eax
 movl %eax,(%edi,%edx,4)
 decl %ecx
 movl 4(%esi,%edx,4),%eax
 leal 1(%edx),%edx
- jge .L007sub
+ jge .L006sub
 sbbl $0,%eax
 movl $-1,%edx
 xorl %eax,%edx
- jmp .L008copy
+ jmp .L007copy
 .align 16
-.L008copy:
+.L007copy:
 movl 32(%esp,%ebx,4),%esi
 movl (%edi,%ebx,4),%ebp
 movl %ecx,32(%esp,%ebx,4)
@@ -205,10 +202,8 @@
 orl %esi,%ebp
 movl %ebp,(%edi,%ebx,4)
 decl %ebx
- jge .L008copy
+ jge .L007copy
 movl 24(%esp),%esp
- movl $1,%eax
-.L000just_leave:
 popl %edi
 popl %esi
 popl %ebx
diff --git a/gen/bcm/x86-mont-win.asm b/gen/bcm/x86-mont-win.asm
index 931275d..d154078 100644
--- a/gen/bcm/x86-mont-win.asm
+++ b/gen/bcm/x86-mont-win.asm
@@ -21,10 +21,7 @@
 push ebx
 push esi
 push edi
- xor eax,eax
 mov edi,DWORD [40+esp]
- cmp edi,4
- jl NEAR L$000just_leave
 lea esi,[20+esp]
 lea edx,[24+esp]
 add edi,2
@@ -47,15 +44,15 @@
 lea esp,[eax*1+ebp]
 mov eax,DWORD [esp]
 cmp esp,ebp
- ja NEAR L$001page_walk
- jmp NEAR L$002page_walk_done
+ ja NEAR L$000page_walk
+ jmp NEAR L$001page_walk_done
 align 16
-L$001page_walk:
+L$000page_walk:
 lea esp,[esp-4096]
 mov eax,DWORD [esp]
 cmp esp,ebp
- ja NEAR L$001page_walk
-L$002page_walk_done:
+ ja NEAR L$000page_walk
+L$001page_walk_done:
 mov eax,DWORD [esi]
 mov ebx,DWORD [4+esi]
 mov ecx,DWORD [8+esi]
@@ -92,7 +89,7 @@
 psrlq mm3,32
 inc ecx
 align 16
-L$0031st:
+L$0021st:
 pmuludq mm0,mm4
 pmuludq mm1,mm5
 paddq mm2,mm0
@@ -107,7 +104,7 @@
 psrlq mm3,32
 lea ecx,[1+ecx]
 cmp ecx,ebx
- jl NEAR L$0031st
+ jl NEAR L$0021st
 pmuludq mm0,mm4
 pmuludq mm1,mm5
 paddq mm2,mm0
@@ -121,7 +118,7 @@
 paddq mm3,mm2
 movq [32+ebx*4+esp],mm3
 inc edx
-L$004outer:
+L$003outer:
 xor ecx,ecx
 movd mm4,DWORD [edx*4+edi]
 movd mm5,DWORD [esi]
@@ -143,7 +140,7 @@
 paddq mm2,mm6
 inc ecx
 dec ebx
-L$005inner:
+L$004inner:
 pmuludq mm0,mm4
 pmuludq mm1,mm5
 paddq mm2,mm0
@@ -160,7 +157,7 @@
 paddq mm2,mm6
 dec ebx
 lea ecx,[1+ecx]
- jnz NEAR L$005inner
+ jnz NEAR L$004inner
 mov ebx,ecx
 pmuludq mm0,mm4
 pmuludq mm1,mm5
@@ -178,11 +175,11 @@
 movq [32+ebx*4+esp],mm3
 lea edx,[1+edx]
 cmp edx,ebx
- jle NEAR L$004outer
+ jle NEAR L$003outer
 emms
- jmp NEAR L$006common_tail
+ jmp NEAR L$005common_tail
 align 16
-L$006common_tail:
+L$005common_tail:
 mov ebp,DWORD [16+esp]
 mov edi,DWORD [4+esp]
 lea esi,[32+esp]
@@ -190,19 +187,19 @@
 mov ecx,ebx
 xor edx,edx
 align 16
-L$007sub:
+L$006sub:
 sbb eax,DWORD [edx*4+ebp]
 mov DWORD [edx*4+edi],eax
 dec ecx
 mov eax,DWORD [4+edx*4+esi]
 lea edx,[1+edx]
- jge NEAR L$007sub
+ jge NEAR L$006sub
 sbb eax,0
 mov edx,-1
 xor edx,eax
- jmp NEAR L$008copy
+ jmp NEAR L$007copy
 align 16
-L$008copy:
+L$007copy:
 mov esi,DWORD [32+ebx*4+esp]
 mov ebp,DWORD [ebx*4+edi]
 mov DWORD [32+ebx*4+esp],ecx
@@ -211,10 +208,8 @@
 or ebp,esi
 mov DWORD [ebx*4+edi],ebp
 dec ebx
- jge NEAR L$008copy
+ jge NEAR L$007copy
 mov esp,DWORD [24+esp]
- mov eax,1
-L$000just_leave:
 pop edi
 pop esi
 pop ebx
diff --git a/gen/bcm/x86_64-mont-apple.S b/gen/bcm/x86_64-mont-apple.S
index d429f7c..27a168d 100644
--- a/gen/bcm/x86_64-mont-apple.S
+++ b/gen/bcm/x86_64-mont-apple.S
@@ -229,7 +229,7 @@
 movq 8(%rsp,%r9,8),%rsi
- movq $1,%rax
+
 movq -48(%rsi),%r15
 movq -40(%rsi),%r14
@@ -662,7 +662,7 @@
 jnz L$copy4x
 movq 8(%rsp,%r9,8),%rsi
- movq $1,%rax
+
 movq -48(%rsi),%r15
 movq -40(%rsi),%r14
@@ -853,7 +853,7 @@
 addq $32,%r9
 jnz L$sqr8x_cond_copy
- movq $1,%rax
+
 movq -48(%rsi),%r15
 movq -40(%rsi),%r14
@@ -1211,7 +1211,7 @@
 movq %rdx,(%rbx)
- movq $1,%rax
+
 movq -48(%rsi),%r15
 movq -40(%rsi),%r14
diff --git a/gen/bcm/x86_64-mont-linux.S b/gen/bcm/x86_64-mont-linux.S
index 630bb72..51c4b6c 100644
--- a/gen/bcm/x86_64-mont-linux.S
+++ b/gen/bcm/x86_64-mont-linux.S
@@ -229,7 +229,7 @@
 movq 8(%rsp,%r9,8),%rsi
 .cfi_def_cfa %rsi,8
- movq $1,%rax
+
 movq -48(%rsi),%r15
 .cfi_restore %r15
 movq -40(%rsi),%r14
@@ -662,7 +662,7 @@
 jnz .Lcopy4x
 movq 8(%rsp,%r9,8),%rsi
 .cfi_def_cfa %rsi, 8
- movq $1,%rax
+
 movq -48(%rsi),%r15
 .cfi_restore %r15
 movq -40(%rsi),%r14
@@ -855,7 +855,7 @@
 addq $32,%r9
 jnz .Lsqr8x_cond_copy
- movq $1,%rax
+
 movq -48(%rsi),%r15
 .cfi_restore %r15
 movq -40(%rsi),%r14
@@ -1213,7 +1213,7 @@
 movq %rdx,(%rbx)
- movq $1,%rax
+
 movq -48(%rsi),%r15
 .cfi_restore %r15
 movq -40(%rsi),%r14
diff --git a/gen/bcm/x86_64-mont-win.asm b/gen/bcm/x86_64-mont-win.asm
index 7e54c66..c768d16 100644
--- a/gen/bcm/x86_64-mont-win.asm
+++ b/gen/bcm/x86_64-mont-win.asm
@@ -248,7 +248,7 @@
 mov rsi,QWORD[8+r9*8+rsp]
- mov rax,1
+
 mov r15,QWORD[((-48))+rsi]
 mov r14,QWORD[((-40))+rsi]
@@ -694,7 +694,7 @@
 jnz NEAR $L$copy4x
 mov rsi,QWORD[8+r9*8+rsp]
- mov rax,1
+
 mov r15,QWORD[((-48))+rsi]
 mov r14,QWORD[((-40))+rsi]
@@ -898,7 +898,7 @@
 add r9,32
 jnz NEAR $L$sqr8x_cond_copy
- mov rax,1
+
 mov r15,QWORD[((-48))+rsi]
 mov r14,QWORD[((-40))+rsi]
@@ -1269,7 +1269,7 @@
 mov QWORD[rbx],rdx
- mov rax,1
+
 mov r15,QWORD[((-48))+rsi]
 mov r14,QWORD[((-40))+rsi]